@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
package/dist/events.d.ts ADDED
@@ -0,0 +1,56 @@
1
+ export type CrawlEvent = {
2
+ type: 'crawl:start';
3
+ url: string;
4
+ } | {
5
+ type: 'crawl:success';
6
+ url: string;
7
+ status: number;
8
+ durationMs: number;
9
+ depth?: number;
10
+ } | {
11
+ type: 'crawl:error';
12
+ url: string;
13
+ error: string;
14
+ depth?: number;
15
+ } | {
16
+ type: 'crawl:limit-reached';
17
+ limit: number;
18
+ } | {
19
+ type: 'crawl:progress';
20
+ pagesCrawled: number;
21
+ queued: number;
22
+ active: number;
23
+ nodesFound: number;
24
+ edgesFound: number;
25
+ phase?: string;
26
+ } | {
27
+ type: 'queue:enqueue';
28
+ url: string;
29
+ depth: number;
30
+ } | {
31
+ type: 'metrics:start';
32
+ phase: string;
33
+ } | {
34
+ type: 'metrics:complete';
35
+ durationMs: number;
36
+ } | {
37
+ type: 'debug';
38
+ message: string;
39
+ context?: unknown;
40
+ } | {
41
+ type: 'info';
42
+ message: string;
43
+ context?: unknown;
44
+ } | {
45
+ type: 'warn';
46
+ message: string;
47
+ context?: unknown;
48
+ } | {
49
+ type: 'error';
50
+ message: string;
51
+ error?: unknown;
52
+ context?: unknown;
53
+ };
54
+ export interface EngineContext {
55
+ emit: (event: CrawlEvent) => void;
56
+ }
package/dist/events.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/dist/graph/graph.d.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export interface GraphNode {
2
2
  url: string;
3
+ isInternal?: boolean;
3
4
  depth: number;
4
5
  inLinks: number;
5
6
  outLinks: number;
@@ -9,45 +10,48 @@ export interface GraphNode {
9
10
  nofollow?: boolean;
10
11
  brokenLinks?: string[];
11
12
  redirectChain?: string[];
13
+ discoveredViaSitemap?: boolean;
12
14
  incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
13
15
  etag?: string;
14
16
  lastModified?: string;
15
17
  contentHash?: string;
16
18
  html?: string;
17
- pageRank?: number;
18
- pageRankScore?: number;
19
- authorityScore?: number;
20
- hubScore?: number;
21
- duplicateClusterId?: string;
22
- duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
23
- isClusterPrimary?: boolean;
24
- isCollapsed?: boolean;
25
- collapseInto?: string;
26
19
  simhash?: string;
27
20
  uniqueTokenRatio?: number;
28
- soft404Score?: number;
29
- soft404Signals?: string[];
30
21
  crawlTrapFlag?: boolean;
31
22
  crawlTrapRisk?: number;
32
23
  trapType?: string;
33
24
  securityError?: string;
34
25
  retries?: number;
35
- clusterId?: number;
36
26
  bytesReceived?: number;
37
- linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
27
+ crawlStatus?: string;
28
+ wordCount?: number;
29
+ thinContentScore?: number;
30
+ externalLinkRatio?: number;
31
+ h1Count?: number;
32
+ h2Count?: number;
33
+ title?: string;
34
+ clusterId?: number;
35
+ duplicateClusterId?: string;
36
+ duplicateType?: 'exact' | 'near' | 'template_heavy';
37
+ pagerankScore?: number;
38
+ hubScore?: number;
39
+ authScore?: number;
40
+ linkRole?: string;
41
+ soft404Score?: number;
42
+ headingScore?: number;
43
+ orphanScore?: number;
44
+ orphanType?: string;
45
+ impactLevel?: string;
46
+ headingData?: string;
47
+ isClusterPrimary?: boolean;
48
+ isCollapsed?: boolean;
38
49
  }
39
50
  export interface GraphEdge {
40
51
  source: string;
41
52
  target: string;
42
53
  weight: number;
43
54
  }
44
- export interface ClusterInfo {
45
- id: number;
46
- count: number;
47
- primaryUrl: string;
48
- risk: 'low' | 'medium' | 'high';
49
- sharedPathPrefix?: string;
50
- }
51
55
  export interface CrawlStats {
52
56
  pagesFetched: number;
53
57
  pagesCached: number;
@@ -59,25 +63,23 @@ export declare class Graph {
59
63
  edges: Map<string, number>;
60
64
  limitReached: boolean;
61
65
  sessionStats: CrawlStats;
62
- trapClusters: {
63
- pattern: string;
64
- type: string;
65
- count: number;
66
- }[];
67
- duplicateClusters: {
68
- id: string;
69
- type: 'exact' | 'near' | 'template_heavy';
70
- size: number;
71
- representative: string;
72
- severity: 'low' | 'medium' | 'high';
73
- }[];
74
- contentClusters: ClusterInfo[];
66
+ /**
67
+ * Generates a unique key for an edge.
68
+ */
69
+ static getEdgeKey(source: string, target: string): string;
70
+ /**
71
+ * Parses an edge key back into source and target.
72
+ */
73
+ static parseEdgeKey(key: string): {
74
+ source: string;
75
+ target: string;
76
+ };
75
77
  /**
76
78
  * Adds a node to the graph if it doesn't exist.
77
79
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
78
80
  * Depth is only set on creation (BFS guarantees shortest path first).
79
81
  */
80
- addNode(url: string, depth: number, status?: number): void;
82
+ addNode(url: string, depth: number, status?: number, isInternal?: boolean): void;
81
83
  updateNodeData(url: string, data: Partial<GraphNode>): void;
82
84
  /**
83
85
  * Adds a directed edge between two nodes.
@@ -90,14 +92,6 @@ export declare class Graph {
90
92
  toJSON(): {
91
93
  nodes: GraphNode[];
92
94
  edges: GraphEdge[];
93
- duplicateClusters: {
94
- id: string;
95
- type: "exact" | "near" | "template_heavy";
96
- size: number;
97
- representative: string;
98
- severity: "low" | "medium" | "high";
99
- }[];
100
- contentClusters: ClusterInfo[];
101
95
  };
102
96
  static fromJSON(json: any): Graph;
103
97
  }
package/dist/graph/graph.js CHANGED
@@ -1,6 +1,6 @@
1
1
  export class Graph {
2
2
  nodes = new Map();
3
- // Using string "source|target" to ensure uniqueness efficiently. Mapping to weight.
3
+ // Using the string source + '\x00' + target (see getEdgeKey) to ensure uniqueness. Mapping to weight.
4
4
  edges = new Map();
5
5
  limitReached = false;
6
6
  sessionStats = {
@@ -9,19 +9,33 @@ export class Graph {
9
9
  pagesSkipped: 0,
10
10
  totalFound: 0
11
11
  };
12
- trapClusters = [];
13
- duplicateClusters = [];
14
- contentClusters = [];
12
+ /**
13
+ * Generates the map key for an edge: source and target joined by a single NUL (\x00) character. NOTE(review): the key is only unambiguous if source never contains a NUL character; confirm URLs are normalized accordingly.
14
+ */
15
+ static getEdgeKey(source, target) {
16
+ return source + '\x00' + target;
17
+ }
18
+ /**
19
+ * Parses an edge key back into source and target by splitting at the first NUL (\x00) character. A key without that separator is not rejected: indexOf returns -1, so source is the key minus its last character and target is the whole key.
20
+ */
21
+ static parseEdgeKey(key) {
22
+ const splitIndex = key.indexOf('\x00');
23
+ return {
24
+ source: key.slice(0, splitIndex),
25
+ target: key.slice(splitIndex + 1)
26
+ };
27
+ }
15
28
  /**
16
29
  * Adds a node to the graph if it doesn't exist.
17
30
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
18
31
  * Depth is only set on creation (BFS guarantees shortest path first).
19
32
  */
20
- addNode(url, depth, status = 0) {
33
+ addNode(url, depth, status = 0, isInternal = true) {
21
34
  const existing = this.nodes.get(url);
22
35
  if (!existing) {
23
36
  this.nodes.set(url, {
24
37
  url,
38
+ isInternal,
25
39
  depth,
26
40
  status,
27
41
  inLinks: 0,
@@ -33,6 +47,9 @@ export class Graph {
33
47
  if (status !== 0) {
34
48
  existing.status = status;
35
49
  }
50
+ if (isInternal !== undefined) {
51
+ existing.isInternal = isInternal;
52
+ }
36
53
  }
37
54
  }
38
55
  updateNodeData(url, data) {
@@ -50,7 +67,7 @@ export class Graph {
50
67
  const sourceNode = this.nodes.get(source);
51
68
  const targetNode = this.nodes.get(target);
52
69
  if (sourceNode && targetNode) {
53
- const edgeKey = `${source}|${target}`;
70
+ const edgeKey = Graph.getEdgeKey(source, target);
54
71
  if (!this.edges.has(edgeKey)) {
55
72
  this.edges.set(edgeKey, weight);
56
73
  sourceNode.outLinks++;
@@ -70,16 +87,14 @@ export class Graph {
70
87
  }
71
88
  getEdges() {
72
89
  return Array.from(this.edges.entries()).map(([edge, weight]) => {
73
- const [source, target] = edge.split('|');
90
+ const { source, target } = Graph.parseEdgeKey(edge);
74
91
  return { source, target, weight };
75
92
  });
76
93
  }
77
94
  toJSON() {
78
95
  return {
79
96
  nodes: this.getNodes(),
80
- edges: this.getEdges(),
81
- duplicateClusters: this.duplicateClusters,
82
- contentClusters: this.contentClusters
97
+ edges: this.getEdges()
83
98
  };
84
99
  }
85
100
  static fromJSON(json) {
@@ -91,16 +106,10 @@ export class Graph {
91
106
  }
92
107
  if (json.edges) {
93
108
  for (const edge of json.edges) {
94
- const key = `${edge.source}|${edge.target}`;
109
+ const key = Graph.getEdgeKey(edge.source, edge.target);
95
110
  graph.edges.set(key, edge.weight || 1.0);
96
111
  }
97
112
  }
98
- if (json.duplicateClusters) {
99
- graph.duplicateClusters = json.duplicateClusters;
100
- }
101
- if (json.contentClusters) {
102
- graph.contentClusters = json.contentClusters;
103
- }
104
113
  return graph;
105
114
  }
106
115
  }
package/dist/graph/hits.d.ts ADDED
@@ -0,0 +1,23 @@
1
+ import { Graph } from './graph.js';
2
+ export type LinkRole = 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
3
+ export interface HITSRow {
4
+ authority_score: number;
5
+ hub_score: number;
6
+ link_role: LinkRole;
7
+ }
8
+ export interface HITSOptions {
9
+ iterations?: number;
10
+ }
11
+ /**
12
+ * Service to compute Hub and Authority scores using the HITS algorithm.
13
+ * Operates on the nodes and edges of the Graph it is given. NOTE(review): the implementation does not filter on isInternal, so presumably the caller supplies an internal-only graph; confirm.
14
+ */
15
+ export declare class HITSService {
16
+ /**
17
+ * Computes Hub and Authority scores using the HITS algorithm. Only nodes with status 200 or 0, an empty or absent redirectChain, and no noindex flag take part; self-links and edges touching any other node are ignored.
18
+ * @param {Graph} graph - The link graph to analyze.
19
+ * @param {HITSOptions} options - Algorithm options. iterations defaults to 20; any falsy value (including 0) also falls back to 20.
20
+ * @returns {Map<string, HITSRow>} A map from each eligible page URL to its HITS results; empty when no node is eligible.
21
+ */
22
+ evaluate(graph: Graph, options?: HITSOptions): Map<string, HITSRow>;
23
+ }
package/dist/graph/hits.js ADDED
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Service to compute Hub and Authority scores using the HITS algorithm.
3
+ * Operates on the nodes and edges of the Graph it is given. NOTE(review): nodes are not filtered by isInternal here, so presumably the caller supplies an internal-only graph; confirm.
4
+ */
5
+ export class HITSService {
6
+ /**
7
+ * Computes Hub and Authority scores using the HITS algorithm. Only nodes with status 200 or 0, an empty or absent redirectChain, and no noindex flag take part; self-links and edges touching any other node are ignored.
8
+ * @param {Graph} graph - The link graph to analyze. Read through getNodes() and getEdges().
9
+ * @param {HITSOptions} options - Algorithm options. iterations defaults to 20; any falsy value (including 0) also falls back to 20.
10
+ * @returns {Map<string, HITSRow>} A map from each eligible page URL to its authority_score, hub_score and link_role; empty when no node is eligible.
11
+ */
12
+ evaluate(graph, options = {}) {
13
+ const iterations = options.iterations || 20;
14
+ const nodes = graph.getNodes();
15
+ // 1. Filter eligible nodes: status 200 or 0, no redirect chain, not noindex
16
+ const eligibleNodes = nodes.filter(n => (n.status === 200 || n.status === 0) &&
17
+ (!n.redirectChain || n.redirectChain.length === 0) &&
18
+ !n.noindex);
19
+ const N = eligibleNodes.length;
20
+ const results = new Map();
21
+ if (N === 0)
22
+ return results;
23
+ // Map each eligible URL to its position in eligibleNodes
24
+ const urlToIndex = new Map();
25
+ for (let i = 0; i < N; i++) {
26
+ urlToIndex.set(eligibleNodes[i].url, i);
27
+ }
28
+ // Build adjacency lists; self-links and edges with an ineligible endpoint are skipped, and a falsy weight counts as 1.0
29
+ const incoming = new Array(N).fill(null).map(() => []);
30
+ const outgoing = new Array(N).fill(null).map(() => []);
31
+ const allEdges = graph.getEdges();
32
+ for (const edge of allEdges) {
33
+ if (edge.source === edge.target)
34
+ continue;
35
+ const sourceIndex = urlToIndex.get(edge.source);
36
+ const targetIndex = urlToIndex.get(edge.target);
37
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
38
+ const weight = edge.weight || 1.0;
39
+ incoming[targetIndex].push({ sourceIndex, weight });
40
+ outgoing[sourceIndex].push({ targetIndex, weight });
41
+ }
42
+ }
43
+ // Initialize every authority and hub score to 1.0
44
+ const authScores = new Float64Array(N).fill(1.0);
45
+ const hubScores = new Float64Array(N).fill(1.0);
46
+ // 2. Iteration: recompute authority from the current hub scores, then hub from the new authority scores, dividing each vector by its Euclidean norm when that norm is positive
47
+ for (let iter = 0; iter < iterations; iter++) {
48
+ let normAuth = 0;
49
+ for (let i = 0; i < N; i++) {
50
+ const inLinks = incoming[i];
51
+ let newAuth = 0;
52
+ for (let j = 0; j < inLinks.length; j++) {
53
+ const link = inLinks[j];
54
+ newAuth += hubScores[link.sourceIndex] * link.weight;
55
+ }
56
+ authScores[i] = newAuth;
57
+ normAuth += newAuth * newAuth;
58
+ }
59
+ normAuth = Math.sqrt(normAuth);
60
+ if (normAuth > 0) {
61
+ for (let i = 0; i < N; i++)
62
+ authScores[i] /= normAuth;
63
+ }
64
+ let normHub = 0;
65
+ for (let i = 0; i < N; i++) {
66
+ const outLinks = outgoing[i];
67
+ let newHub = 0;
68
+ for (let j = 0; j < outLinks.length; j++) {
69
+ const link = outLinks[j];
70
+ newHub += authScores[link.targetIndex] * link.weight;
71
+ }
72
+ hubScores[i] = newHub;
73
+ normHub += newHub * newHub;
74
+ }
75
+ normHub = Math.sqrt(normHub);
76
+ if (normHub > 0) {
77
+ for (let i = 0; i < N; i++)
78
+ hubScores[i] /= normHub;
79
+ }
80
+ }
81
+ // 3. Classification and result mapping: a score is high when it exceeds the median (the element at index floor(N / 2) of the ascending sort) or equals a positive maximum, and is also above 0.00001; both high gives power, one high gives authority or hub, both merely above 0.00001 gives balanced, anything else stays peripheral
82
+ const sortedAuth = [...authScores].sort((a, b) => a - b);
83
+ const sortedHub = [...hubScores].sort((a, b) => a - b);
84
+ const medianAuth = sortedAuth[Math.floor(sortedAuth.length / 2)];
85
+ const medianHub = sortedHub[Math.floor(sortedHub.length / 2)];
86
+ const maxAuth = sortedAuth[sortedAuth.length - 1];
87
+ const maxHub = sortedHub[sortedHub.length - 1];
88
+ for (let i = 0; i < N; i++) {
89
+ const auth = authScores[i];
90
+ const hub = hubScores[i];
91
+ const url = eligibleNodes[i].url;
92
+ const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
93
+ const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
94
+ let link_role = 'peripheral';
95
+ if (isHighAuth && isHighHub)
96
+ link_role = 'power';
97
+ else if (isHighAuth)
98
+ link_role = 'authority';
99
+ else if (isHighHub)
100
+ link_role = 'hub';
101
+ else if (auth > 0.00001 && hub > 0.00001)
102
+ link_role = 'balanced';
103
+ results.set(url, {
104
+ authority_score: auth,
105
+ hub_score: hub,
106
+ link_role
107
+ });
108
+ }
109
+ return results;
110
+ }
111
+ }
package/dist/graph/metrics.d.ts CHANGED
@@ -14,10 +14,6 @@ export interface Metrics {
14
14
  crawlEfficiencyScore: number;
15
15
  averageDepth: number;
16
16
  structuralEntropy: number;
17
- topPageRankPages: {
18
- url: string;
19
- score: number;
20
- }[];
21
17
  limitReached: boolean;
22
18
  sessionStats?: {
23
19
  pagesFetched: number;
package/dist/graph/metrics.js CHANGED
@@ -3,6 +3,28 @@ export function calculateMetrics(graph, _maxDepth) {
3
3
  const edges = graph.getEdges();
4
4
  const totalPages = nodes.length;
5
5
  const totalEdges = edges.length;
6
+ // Identify broken nodes
7
+ const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
8
+ // Pre-compute outgoing edges per node for faster lookup
9
+ const outgoingEdges = new Map();
10
+ for (const edge of edges) {
11
+ let targets = outgoingEdges.get(edge.source);
12
+ if (!targets) {
13
+ targets = [];
14
+ outgoingEdges.set(edge.source, targets);
15
+ }
16
+ targets.push(edge.target);
17
+ }
18
+ // Populate brokenLinks per node
19
+ for (const node of nodes) {
20
+ const targets = outgoingEdges.get(node.url);
21
+ if (targets) {
22
+ const broken = targets.filter(targetUrl => brokenNodes.has(targetUrl));
23
+ if (broken.length > 0) {
24
+ node.brokenLinks = broken;
25
+ }
26
+ }
27
+ }
6
28
  // Authority Score (per node)
7
29
  const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
8
30
  const getAuthority = (node) => {
@@ -43,16 +65,11 @@ export function calculateMetrics(graph, _maxDepth) {
43
65
  }
44
66
  }
45
67
  // topAuthorityPages: Top 10 by authority
46
- const topAuthorityPages = [...nodes]
47
- .map(n => ({ url: n.url, authority: n.authorityScore ?? getAuthority(n) }))
68
+ const topAuthorityPages = nodes
69
+ .filter(n => n.isInternal !== false && n.status > 0)
70
+ .map(n => ({ url: n.url, authority: getAuthority(n) }))
48
71
  .sort((a, b) => b.authority - a.authority)
49
72
  .slice(0, 10);
50
- // topPageRankPages: Top 10 by raw PageRank
51
- const topPageRankPages = [...nodes]
52
- .filter(n => n.pageRank !== undefined)
53
- .map(n => ({ url: n.url, score: n.pageRank }))
54
- .sort((a, b) => b.score - a.score)
55
- .slice(0, 10);
56
73
  const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
57
74
  const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);
58
75
  return {
@@ -67,7 +84,6 @@ export function calculateMetrics(graph, _maxDepth) {
67
84
  crawlEfficiencyScore,
68
85
  averageDepth,
69
86
  structuralEntropy,
70
- topPageRankPages,
71
87
  limitReached: graph.limitReached,
72
88
  sessionStats: graph.sessionStats
73
89
  };
package/dist/graph/pagerank.d.ts CHANGED
@@ -1,12 +1,25 @@
1
1
  import { Graph } from './graph.js';
2
- interface PageRankOptions {
2
+ export interface PageRankRow {
3
+ raw_rank: number;
4
+ score: number;
5
+ }
6
+ export interface PageRankOptions {
3
7
  dampingFactor?: number;
4
8
  maxIterations?: number;
5
9
  convergenceThreshold?: number;
6
10
  soft404WeightThreshold?: number;
11
+ neutralScoreWhenFlat?: number;
7
12
  }
8
13
  /**
9
- * Production-Grade Weighted PageRank Engine
14
+ * Service to analyze a site's link graph and compute PageRank metrics.
15
+ * Runs only on the full crawl graph.
10
16
  */
11
- export declare function computePageRank(graph: Graph, options?: PageRankOptions): void;
12
- export {};
17
+ export declare class PageRankService {
18
+ /**
19
+ * Computes a Production-Grade Weighted PageRank over the given graph.
20
+ * @param {Graph} graph - The full site graph structure.
21
+ * @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
22
+ * @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
23
+ */
24
+ evaluate(graph: Graph, options?: PageRankOptions): Map<string, PageRankRow>;
25
+ }