@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -2,9 +2,10 @@ import { load } from 'cheerio';
2
2
  function normalizedText(value) {
3
3
  return (value ?? '').trim().toLowerCase();
4
4
  }
5
- export function analyzeTitle(html) {
6
- const $ = load(html);
7
- const title = $('title').first().text().trim();
5
+ export function analyzeTitle($) {
6
+ const isString = typeof $ === 'string';
7
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
8
+ const title = cheerioObj('title').first().text().trim();
8
9
  if (!title) {
9
10
  return { value: null, length: 0, status: 'missing' };
10
11
  }
@@ -14,9 +15,10 @@ export function analyzeTitle(html) {
14
15
  return { value: title, length: title.length, status: 'too_long' };
15
16
  return { value: title, length: title.length, status: 'ok' };
16
17
  }
17
- export function analyzeMetaDescription(html) {
18
- const $ = load(html);
19
- const raw = $('meta[name="description"]').attr('content');
18
+ export function analyzeMetaDescription($) {
19
+ const isString = typeof $ === 'string';
20
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
21
+ const raw = cheerioObj('meta[name="description"]').attr('content');
20
22
  if (raw === undefined) {
21
23
  return { value: null, length: 0, status: 'missing' };
22
24
  }
@@ -30,35 +32,44 @@ export function analyzeMetaDescription(html) {
30
32
  return { value: description, length: description.length, status: 'too_long' };
31
33
  return { value: description, length: description.length, status: 'ok' };
32
34
  }
33
- export function applyDuplicateStatuses(fields) {
34
- const counts = new Map();
35
- for (const field of fields) {
36
- const key = normalizedText(field.value);
37
- if (!key)
38
- continue;
39
- counts.set(key, (counts.get(key) || 0) + 1);
40
- }
41
- return fields.map((field) => {
42
- const key = normalizedText(field.value);
43
- if (!key)
44
- return field;
45
- if ((counts.get(key) || 0) > 1) {
46
- return { ...field, status: 'duplicate' };
47
- }
48
- return field;
49
- });
50
- }
51
- export function analyzeH1(html, titleValue) {
52
- const $ = load(html);
53
- const h1Values = $('h1').toArray().map((el) => $(el).text().trim()).filter(Boolean);
35
+ export function analyzeH1($, titleValue) {
36
+ const isString = typeof $ === 'string';
37
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
38
+ const h1Values = cheerioObj('h1').toArray().map((el) => cheerioObj(el).text().trim()).filter(Boolean);
54
39
  const count = h1Values.length;
55
40
  const first = h1Values[0] || null;
56
41
  const matchesTitle = Boolean(first && titleValue && normalizedText(first) === normalizedText(titleValue));
57
42
  if (count === 0) {
58
- return { count, status: 'critical', matchesTitle };
43
+ return { count, status: 'critical', matchesTitle, value: null };
59
44
  }
60
45
  if (count > 1) {
61
- return { count, status: 'warning', matchesTitle };
46
+ return { count, status: 'warning', matchesTitle, value: first };
62
47
  }
63
- return { count, status: 'ok', matchesTitle };
48
+ return { count, status: 'ok', matchesTitle, value: first };
49
+ }
50
+ export function applyDuplicateStatuses(items) {
51
+ const counts = new Map();
52
+ const normalizedToOriginal = new Map();
53
+ // First pass: count occurrences of each normalized value
54
+ for (const item of items) {
55
+ if (item.value) {
56
+ const normalized = normalizedText(item.value);
57
+ if (normalized) {
58
+ counts.set(normalized, (counts.get(normalized) || 0) + 1);
59
+ if (!normalizedToOriginal.has(normalized)) {
60
+ normalizedToOriginal.set(normalized, item.value);
61
+ }
62
+ }
63
+ }
64
+ }
65
+ // Second pass: apply duplicate status
66
+ return items.map(item => {
67
+ if (item.value) {
68
+ const normalized = normalizedText(item.value);
69
+ if ((counts.get(normalized) || 0) > 1) {
70
+ return { ...item, status: 'duplicate' };
71
+ }
72
+ }
73
+ return item;
74
+ });
64
75
  }
@@ -0,0 +1,17 @@
1
+ export interface Soft404Result {
2
+ score: number;
3
+ reason: string;
4
+ }
5
+ /**
6
+ * Service to analyze HTML content for soft 404 signals.
7
+ * Extracts signals from title, H1, body content length, and outlinks.
8
+ */
9
+ export declare class Soft404Service {
10
+ /**
11
+ * Analyzes HTML string to determine probability of being a soft 404 page.
12
+ * @param {string | undefined} html - Raw HTML source code.
13
+ * @param {number} outLinks - Total number of outbound links extracted during parsing.
14
+ * @returns {Soft404Result} A calculated score between 0.0 and 1.0, and the matched reasons.
15
+ */
16
+ analyze(html: string | undefined, outLinks: number): Soft404Result;
17
+ }
@@ -0,0 +1,62 @@
1
+ import * as cheerio from 'cheerio';
2
+ /**
3
+ * Service to analyze HTML content for soft 404 signals.
4
+ * Extracts signals from title, H1, body content length, and outlinks.
5
+ */
6
+ export class Soft404Service {
7
+ /**
8
+ * Analyzes HTML string to determine probability of being a soft 404 page.
9
+ * @param {string | undefined} html - Raw HTML source code.
10
+ * @param {number} outLinks - Total number of outbound links extracted during parsing.
11
+ * @returns {Soft404Result} A calculated score between 0.0 and 1.0, and the matched reasons.
12
+ */
13
+ analyze(html, outLinks) {
14
+ if (!html)
15
+ return { score: 0, reason: '' };
16
+ let score = 0;
17
+ const signals = [];
18
+ const $ = cheerio.load(html);
19
+ $('script, style, noscript, iframe').remove();
20
+ const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
21
+ const title = $('title').text().toLowerCase();
22
+ const h1Text = $('h1').first().text().toLowerCase();
23
+ const bodyText = cleanText.toLowerCase();
24
+ const errorPatterns = ['404', 'not found', 'error', "doesn't exist", 'unavailable', 'invalid'];
25
+ for (const pattern of errorPatterns) {
26
+ if (title.includes(pattern)) {
27
+ score += 0.4;
28
+ signals.push(`title_contains_${pattern}`);
29
+ break;
30
+ }
31
+ }
32
+ for (const pattern of errorPatterns) {
33
+ if (h1Text.includes(pattern)) {
34
+ score += 0.3;
35
+ signals.push(`h1_contains_${pattern}`);
36
+ break;
37
+ }
38
+ }
39
+ if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
40
+ score += 0.2;
41
+ signals.push('body_error_phrase');
42
+ }
43
+ const words = cleanText.split(/\s+/).filter(w => w.length > 0);
44
+ if (words.length < 50) {
45
+ score += 0.3;
46
+ signals.push('very_low_word_count');
47
+ }
48
+ else if (words.length < 150) {
49
+ score += 0.1;
50
+ signals.push('low_word_count');
51
+ }
52
+ if (outLinks === 0) {
53
+ score += 0.2;
54
+ signals.push('no_outbound_links');
55
+ }
56
+ score = Math.min(1.0, score);
57
+ return {
58
+ score: Number(score.toFixed(2)),
59
+ reason: signals.join(', ')
60
+ };
61
+ }
62
+ }
@@ -3,4 +3,4 @@ export interface StructuredDataResult {
3
3
  types: string[];
4
4
  valid: boolean;
5
5
  }
6
- export declare function analyzeStructuredData(html: string): StructuredDataResult;
6
+ export declare function analyzeStructuredData($: any): StructuredDataResult;
@@ -1,14 +1,15 @@
1
1
  import { load } from 'cheerio';
2
- export function analyzeStructuredData(html) {
3
- const $ = load(html);
4
- const scripts = $('script[type="application/ld+json"]').toArray();
2
+ export function analyzeStructuredData($) {
3
+ const isString = typeof $ === 'string';
4
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
5
+ const scripts = cheerioObj('script[type="application/ld+json"]').toArray();
5
6
  if (scripts.length === 0) {
6
7
  return { present: false, types: [], valid: false };
7
8
  }
8
9
  const types = new Set();
9
10
  let valid = true;
10
11
  for (const script of scripts) {
11
- const raw = $(script).text().trim();
12
+ const raw = cheerioObj(script).text().trim();
12
13
  if (!raw) {
13
14
  valid = false;
14
15
  continue;
@@ -0,0 +1,2 @@
1
+ export * from './usecase.js';
2
+ export * from './usecases.js';
@@ -0,0 +1,2 @@
1
+ export * from './usecase.js';
2
+ export * from './usecases.js';
@@ -0,0 +1,3 @@
1
+ export interface UseCase<I, O> {
2
+ execute(input: I): Promise<O>;
3
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,114 @@
1
+ import { type AnalyzeOptions, type AnalysisResult } from '../analysis/analyze.js';
2
+ import { compareGraphs } from '../diff/compare.js';
3
+ import type { CrawlithPlugin, PluginContext } from '../plugin-system/plugin-types.js';
4
+ import type { UseCase } from './usecase.js';
5
+ import type { Graph } from '../graph/graph.js';
6
+ import type { EngineContext } from '../events.js';
7
+ export interface CrawlSitegraphResult {
8
+ snapshotId: number;
9
+ graph: Graph;
10
+ metrics?: any;
11
+ healthData?: any;
12
+ }
13
+ export interface SiteCrawlInput {
14
+ url: string;
15
+ limit?: number;
16
+ depth?: number;
17
+ concurrency?: number;
18
+ stripQuery?: boolean;
19
+ ignoreRobots?: boolean;
20
+ sitemap?: string | boolean;
21
+ debug?: boolean;
22
+ detectSoft404?: boolean;
23
+ detectTraps?: boolean;
24
+ rate?: number;
25
+ maxBytes?: number;
26
+ allowedDomains?: string[];
27
+ deniedDomains?: string[];
28
+ includeSubdomains?: boolean;
29
+ proxyUrl?: string;
30
+ maxRedirects?: number;
31
+ userAgent?: string;
32
+ clustering?: boolean;
33
+ clusterThreshold?: number;
34
+ minClusterSize?: number;
35
+ heading?: boolean;
36
+ failOnCritical?: boolean;
37
+ scoreBreakdown?: boolean;
38
+ computeHits?: boolean;
39
+ computePagerank?: boolean;
40
+ orphans?: boolean;
41
+ orphanSeverity?: 'low' | 'medium' | 'high';
42
+ includeSoftOrphans?: boolean;
43
+ minInbound?: number;
44
+ plugins?: CrawlithPlugin[];
45
+ context?: PluginContext;
46
+ }
47
+ export declare class CrawlSitegraph implements UseCase<SiteCrawlInput, CrawlSitegraphResult> {
48
+ execute(input: SiteCrawlInput): Promise<CrawlSitegraphResult>;
49
+ }
50
+ export declare class AnalyzeSnapshot implements UseCase<{
51
+ url: string;
52
+ options: AnalyzeOptions;
53
+ plugins?: CrawlithPlugin[];
54
+ context?: PluginContext;
55
+ }, AnalysisResult> {
56
+ execute(input: {
57
+ url: string;
58
+ options: AnalyzeOptions;
59
+ plugins?: CrawlithPlugin[];
60
+ context?: PluginContext;
61
+ }): Promise<AnalysisResult>;
62
+ }
63
+ export interface PageAnalysisInput {
64
+ url: string;
65
+ live?: boolean;
66
+ snapshotId?: number;
67
+ seo?: boolean;
68
+ content?: boolean;
69
+ accessibility?: boolean;
70
+ rate?: number;
71
+ proxyUrl?: string;
72
+ userAgent?: string;
73
+ maxRedirects?: number;
74
+ maxBytes?: number;
75
+ clustering?: boolean;
76
+ clusterThreshold?: number;
77
+ minClusterSize?: number;
78
+ debug?: boolean;
79
+ allPages?: boolean;
80
+ sitemap?: string | boolean;
81
+ heading?: boolean;
82
+ health?: boolean;
83
+ failOnCritical?: boolean;
84
+ scoreBreakdown?: boolean;
85
+ computeHits?: boolean;
86
+ computePagerank?: boolean;
87
+ orphans?: boolean;
88
+ orphanSeverity?: 'low' | 'medium' | 'high';
89
+ includeSoftOrphans?: boolean;
90
+ minInbound?: number;
91
+ plugins?: CrawlithPlugin[];
92
+ context?: PluginContext;
93
+ }
94
+ export declare class PageAnalysisUseCase implements UseCase<PageAnalysisInput, AnalysisResult> {
95
+ private readonly context?;
96
+ constructor(context?: EngineContext | undefined);
97
+ execute(input: PageAnalysisInput): Promise<AnalysisResult>;
98
+ }
99
+ export declare class ExportReport implements UseCase<{
100
+ snapshotId: number;
101
+ }, string> {
102
+ execute(input: {
103
+ snapshotId: number;
104
+ }): Promise<string>;
105
+ }
106
+ export declare class DiffSnapshots implements UseCase<{
107
+ oldSnapshotId: number;
108
+ newSnapshotId: number;
109
+ }, ReturnType<typeof compareGraphs>> {
110
+ execute(input: {
111
+ oldSnapshotId: number;
112
+ newSnapshotId: number;
113
+ }): Promise<import("../diff/compare.js").DiffResult>;
114
+ }
@@ -0,0 +1,201 @@
1
+ import { crawl } from '../crawler/crawl.js';
2
+ import { DEFAULTS } from '../constants.js';
3
+ import { runPostCrawlMetrics } from '../crawler/metricsRunner.js';
4
+ import { analyzeSite } from '../analysis/analyze.js';
5
+ import { loadGraphFromSnapshot } from '../db/graphLoader.js';
6
+ import { compareGraphs } from '../diff/compare.js';
7
+ import { SiteRepository } from '../db/repositories/SiteRepository.js';
8
+ import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
9
+ import { PluginRegistry } from '../plugin-system/plugin-registry.js';
10
+ import { getCrawlithDB } from '../db/index.js';
11
+ import { UrlUtil } from '../crawler/normalize.js';
12
+ export class CrawlSitegraph {
13
+ async execute(input) {
14
+ const ctx = input.context ?? { command: 'crawl', scope: 'crawl' };
15
+ ctx.scope = 'crawl';
16
+ ctx.db = getCrawlithDB();
17
+ const registry = new PluginRegistry(input.plugins ?? []);
18
+ await registry.applyStorage(ctx);
19
+ await registry.runHook('onInit', ctx);
20
+ if (ctx.terminate)
21
+ return { snapshotId: 0, graph: undefined };
22
+ await registry.runHook('onCrawlStart', ctx);
23
+ if (ctx.terminate)
24
+ return { snapshotId: 0, graph: undefined };
25
+ const policy = (ctx.metadata?.crawlPolicy || {});
26
+ // Map the unified DTO into the underlying CrawlOptions
27
+ const crawlOpts = {
28
+ limit: input.limit ?? DEFAULTS.CRAWL_LIMIT,
29
+ depth: input.depth ?? DEFAULTS.MAX_DEPTH,
30
+ concurrency: input.concurrency,
31
+ stripQuery: input.stripQuery,
32
+ ignoreRobots: policy.ignoreRobots !== undefined ? policy.ignoreRobots : input.ignoreRobots,
33
+ sitemap: input.sitemap,
34
+ debug: input.debug,
35
+ detectSoft404: input.detectSoft404,
36
+ detectTraps: input.detectTraps,
37
+ rate: policy.rate !== undefined ? policy.rate : input.rate,
38
+ maxBytes: policy.maxBytes !== undefined ? policy.maxBytes : input.maxBytes,
39
+ allowedDomains: policy.allowedDomains?.length ? policy.allowedDomains : input.allowedDomains,
40
+ deniedDomains: policy.deniedDomains?.length ? policy.deniedDomains : input.deniedDomains,
41
+ includeSubdomains: policy.includeSubdomains !== undefined ? policy.includeSubdomains : input.includeSubdomains,
42
+ proxyUrl: policy.proxyUrl !== undefined ? policy.proxyUrl : input.proxyUrl,
43
+ maxRedirects: policy.maxRedirects !== undefined ? policy.maxRedirects : input.maxRedirects,
44
+ userAgent: policy.userAgent !== undefined ? policy.userAgent : input.userAgent,
45
+ registry: registry
46
+ };
47
+ // Build an EngineContext from the plugin context's emit so per-page logs reach OutputController
48
+ const engineContext = ctx.emit
49
+ ? { emit: (e) => ctx.emit(e) }
50
+ : undefined;
51
+ const snapshotId = await crawl(input.url, crawlOpts, engineContext);
52
+ const graph = loadGraphFromSnapshot(snapshotId);
53
+ // Ensure plugin hooks that persist data are scoped to the created snapshot.
54
+ ctx.snapshotId = snapshotId;
55
+ await registry.runHook('onGraphBuilt', ctx, graph);
56
+ await registry.runHook('onMetrics', ctx, graph);
57
+ const db = getCrawlithDB().unsafeGetRawDb();
58
+ const siteRepo = new SiteRepository(db);
59
+ const snapshotRepo = new SnapshotRepository(db);
60
+ const snapshot = snapshotRepo.getSnapshot(snapshotId);
61
+ let resolvedOrigin = '';
62
+ if (snapshot) {
63
+ const site = siteRepo.getSiteById(snapshot.site_id);
64
+ if (site?.preferred_url) {
65
+ try {
66
+ resolvedOrigin = new URL(site.preferred_url).origin;
67
+ }
68
+ catch { /* ignore */ }
69
+ }
70
+ }
71
+ const result = runPostCrawlMetrics(snapshotId, crawlOpts.depth, {
72
+ context: undefined,
73
+ limitReached: false,
74
+ graphInstance: graph,
75
+ clustering: input.clustering ?? true,
76
+ clusterThreshold: input.clusterThreshold,
77
+ minClusterSize: input.minClusterSize,
78
+ heading: input.heading ?? true,
79
+ // Always compute and persist health score for full crawls.
80
+ health: true,
81
+ computeHits: input.computeHits ?? true,
82
+ computePagerank: input.computePagerank ?? true,
83
+ orphans: input.orphans ?? true,
84
+ orphanSeverity: input.orphanSeverity ?? true,
85
+ includeSoftOrphans: input.includeSoftOrphans ?? true,
86
+ minInbound: input.minInbound,
87
+ rootOrigin: resolvedOrigin || (input.url.startsWith('http://') || input.url.startsWith('https://')
88
+ ? new URL(input.url).origin
89
+ : `https://${UrlUtil.extractDomain(input.url)}`)
90
+ });
91
+ const metrics = result?.metrics;
92
+ const healthData = result?.healthData;
93
+ if (ctx.db) {
94
+ ctx.db.aggregateScoreProviders(snapshotId, registry.pluginsList);
95
+ }
96
+ await registry.runHook('onReport', ctx, { snapshotId, graph, metrics, healthData });
97
+ return { snapshotId, graph, metrics, healthData };
98
+ }
99
+ }
100
+ export class AnalyzeSnapshot {
101
+ async execute(input) {
102
+ const result = await analyzeSite(input.url, { ...input.options, live: false });
103
+ if (input.plugins && input.plugins.length > 0) {
104
+ const registry = new PluginRegistry(input.plugins);
105
+ const ctx = {
106
+ command: 'analyze',
107
+ scope: 'crawl',
108
+ ...(input.context || {}),
109
+ db: getCrawlithDB()
110
+ };
111
+ await registry.runHook('onInit', ctx);
112
+ await registry.runHook('onReport', ctx, result);
113
+ }
114
+ return result;
115
+ }
116
+ }
117
+ export class PageAnalysisUseCase {
118
+ context;
119
+ constructor(context) {
120
+ this.context = context;
121
+ }
122
+ async execute(input) {
123
+ // When running a live single-page analysis, enable all content modules
124
+ // by default (they all work well on a single page). Callers can still
125
+ // explicitly pass false to disable any of them.
126
+ const isLive = !!input.live;
127
+ const seo = input.seo ?? (isLive ? true : undefined);
128
+ const content = input.content ?? (isLive ? true : undefined);
129
+ const accessibility = input.accessibility ?? (isLive ? true : undefined);
130
+ const health = input.health ?? (isLive ? true : undefined);
131
+ const heading = input.heading ?? (isLive ? true : undefined);
132
+ const result = await analyzeSite(input.url, {
133
+ live: input.live,
134
+ snapshotId: input.snapshotId,
135
+ seo,
136
+ content,
137
+ accessibility,
138
+ rate: input.rate,
139
+ proxyUrl: input.proxyUrl,
140
+ userAgent: input.userAgent,
141
+ maxRedirects: input.maxRedirects,
142
+ maxBytes: input.maxBytes,
143
+ clustering: input.clustering,
144
+ clusterThreshold: input.clusterThreshold,
145
+ minClusterSize: input.minClusterSize,
146
+ sitemap: input.sitemap,
147
+ heading,
148
+ health,
149
+ failOnCritical: input.failOnCritical,
150
+ scoreBreakdown: input.scoreBreakdown,
151
+ computeHits: input.computeHits,
152
+ computePagerank: input.computePagerank,
153
+ orphans: input.orphans,
154
+ orphanSeverity: input.orphanSeverity,
155
+ includeSoftOrphans: input.includeSoftOrphans,
156
+ minInbound: input.minInbound,
157
+ debug: input.debug,
158
+ allPages: input.allPages,
159
+ }, this.context);
160
+ // Run plugins with page scope — only onInit + onPage are called
161
+ if (input.plugins && input.plugins.length > 0) {
162
+ const { PluginRegistry } = await import('../plugin-system/plugin-registry.js');
163
+ const registry = new PluginRegistry(input.plugins);
164
+ const pluginCtx = {
165
+ command: 'page',
166
+ scope: 'page',
167
+ targetUrl: input.url,
168
+ snapshotId: result.snapshotId,
169
+ live: input.live || !!(input.context?.flags?.live),
170
+ ...(input.context || {}),
171
+ db: getCrawlithDB(),
172
+ };
173
+ await registry.applyStorage(pluginCtx);
174
+ await registry.runHook('onInit', pluginCtx);
175
+ // Fire onPage once per analyzed page (normally just 1 for the page command)
176
+ const inputOrigin = new URL(input.url).origin;
177
+ for (const page of result.pages) {
178
+ const absoluteUrl = UrlUtil.toAbsolute(page.url, inputOrigin);
179
+ await registry.runHook('onPage', pluginCtx, {
180
+ url: absoluteUrl,
181
+ html: page.html ?? '',
182
+ status: page.status,
183
+ });
184
+ }
185
+ if (pluginCtx.db && result.snapshotId) {
186
+ pluginCtx.db.aggregateScoreProviders(result.snapshotId, registry.pluginsList);
187
+ }
188
+ }
189
+ return result;
190
+ }
191
+ }
192
+ export class ExportReport {
193
+ async execute(input) {
194
+ return JSON.stringify(loadGraphFromSnapshot(input.snapshotId).toJSON());
195
+ }
196
+ }
197
+ export class DiffSnapshots {
198
+ async execute(input) {
199
+ return compareGraphs(loadGraphFromSnapshot(input.oldSnapshotId), loadGraphFromSnapshot(input.newSnapshotId));
200
+ }
201
+ }
@@ -26,7 +26,7 @@ export async function auditUrl(urlStr, options = {}) {
26
26
  // We handle transport errors differently as they are fatal for the audit (e.g. connection refused)
27
27
  // DNS errors might return partial results but usually if transport works, DNS worked (unless transport used IP)
28
28
  const dnsPromise = resolveDns(url.hostname);
29
- const transportPromise = analyzeTransport(urlStr, timeout);
29
+ const transportPromise = analyzeTransport(urlStr, timeout, options.userAgent);
30
30
  const [dnsResult, transportResult] = await Promise.all([
31
31
  dnsPromise,
32
32
  transportPromise
@@ -1,5 +1,5 @@
1
1
  import { TransportDiagnostics, PerformanceMetrics, AuditIssue } from './types.js';
2
- export declare function analyzeTransport(targetUrl: string, timeout: number): Promise<{
2
+ export declare function analyzeTransport(targetUrl: string, timeout: number, userAgent?: string): Promise<{
3
3
  transport: TransportDiagnostics;
4
4
  performance: PerformanceMetrics;
5
5
  issues: AuditIssue[];
@@ -3,7 +3,8 @@ import http from 'node:http';
3
3
  import tls from 'node:tls';
4
4
  import { URL } from 'node:url';
5
5
  import { IPGuard } from '../core/security/ipGuard.js';
6
- export async function analyzeTransport(targetUrl, timeout) {
6
+ import { DEFAULTS } from '../constants.js';
7
+ export async function analyzeTransport(targetUrl, timeout, userAgent) {
7
8
  const maxRedirects = 10;
8
9
  let currentUrl = targetUrl;
9
10
  let redirectCount = 0;
@@ -18,7 +19,7 @@ export async function analyzeTransport(targetUrl, timeout) {
18
19
  throw new Error(`Blocked: Redirect to internal/private IP prohibited (${currentUrl})`);
19
20
  }
20
21
  try {
21
- const result = await executeRequest(currentUrl, timeout);
22
+ const result = await executeRequest(currentUrl, timeout, userAgent);
22
23
  if (result.redirectUrl) {
23
24
  redirectCount++;
24
25
  totalRedirectTime += result.timings.total;
@@ -113,7 +114,7 @@ export async function analyzeTransport(targetUrl, timeout) {
113
114
  }
114
115
  throw new Error(`Too many redirects (limit: ${maxRedirects})`);
115
116
  }
116
- function executeRequest(urlStr, timeout) {
117
+ function executeRequest(urlStr, timeout, userAgent) {
117
118
  return new Promise((resolve, reject) => {
118
119
  let url;
119
120
  try {
@@ -143,7 +144,7 @@ function executeRequest(urlStr, timeout) {
143
144
  rejectUnauthorized: false,
144
145
  agent: false,
145
146
  headers: {
146
- 'User-Agent': 'Crawlith/Audit',
147
+ 'User-Agent': userAgent || DEFAULTS.USER_AGENT,
147
148
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
148
149
  'Accept-Encoding': 'gzip, deflate, br'
149
150
  }
@@ -85,4 +85,5 @@ export interface AuditOptions {
85
85
  timeout?: number;
86
86
  verbose?: boolean;
87
87
  debug?: boolean;
88
+ userAgent?: string;
88
89
  }
@@ -0,0 +1,17 @@
1
+ export declare const DEFAULTS: {
2
+ readonly MAX_DEPTH: 5;
3
+ readonly MAX_DEPTH_LIMIT: 10;
4
+ readonly CONCURRENCY: 2;
5
+ readonly CONCURRENCY_LIMIT: 10;
6
+ readonly CRAWL_LIMIT: 500;
7
+ readonly USER_AGENT: `crawlith/${string}`;
8
+ readonly RATE_LIMIT: 10;
9
+ readonly MAX_BYTES: 2000000;
10
+ readonly MAX_REDIRECTS: 5;
11
+ readonly MAX_REDIRECTS_LIMIT: 11;
12
+ readonly HEADERS_TIMEOUT: 10000;
13
+ readonly BODY_TIMEOUT: 10000;
14
+ readonly MAX_SNAPSHOTS: 5;
15
+ readonly MAX_SINGLE_SNAPSHOTS: 5;
16
+ readonly GRAPH_PRECISION: 1e-12;
17
+ };
@@ -0,0 +1,23 @@
1
+ import { version } from './utils/version.js';
2
+ export const DEFAULTS = {
3
+ // Crawler defaults
4
+ MAX_DEPTH: 5,
5
+ MAX_DEPTH_LIMIT: 10,
6
+ CONCURRENCY: 2,
7
+ CONCURRENCY_LIMIT: 10,
8
+ CRAWL_LIMIT: 500,
9
+ // Network/Fetcher defaults
10
+ USER_AGENT: `crawlith/${version}`,
11
+ RATE_LIMIT: 10,
12
+ MAX_BYTES: 2000000, // 2MB
13
+ MAX_REDIRECTS: 5,
14
+ MAX_REDIRECTS_LIMIT: 11,
15
+ // Network timeouts
16
+ HEADERS_TIMEOUT: 10000,
17
+ BODY_TIMEOUT: 10000,
18
+ // Keep only last 5 snapshots
19
+ MAX_SNAPSHOTS: 5,
20
+ MAX_SINGLE_SNAPSHOTS: 5,
21
+ // Graph calculation precision
22
+ GRAPH_PRECISION: 1e-12
23
+ };
@@ -15,6 +15,9 @@ export class ScopeManager {
15
15
  }));
16
16
  }
17
17
  isUrlEligible(url) {
18
+ // Root-relative paths (e.g. '/about', '/?q=foo') are always internal
19
+ if (url.startsWith('/'))
20
+ return 'allowed';
18
21
  let hostname;
19
22
  try {
20
23
  hostname = new URL(url).hostname.toLowerCase();
@@ -1,4 +1,4 @@
1
- import { CrawlOptions } from './crawler.js';
1
+ import { type CrawlOptions } from './crawler.js';
2
2
  import { EngineContext } from '../events.js';
3
- export { CrawlOptions };
3
+ export type { CrawlOptions };
4
4
  export declare function crawl(startUrl: string, options: CrawlOptions, context?: EngineContext): Promise<number>;