@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -6,7 +6,7 @@ export class Parser {
6
6
  /**
7
7
  * Parses HTML content to extract metadata and links.
8
8
  */
9
- parse(html, baseUrl, status) {
9
+ parse(html, baseUrl, _status) {
10
10
  const $ = cheerio.load(html);
11
11
  // 1. Robots Meta
12
12
  let noindex = false;
@@ -97,51 +97,6 @@ export class Parser {
97
97
  const uniqueTokens = new Set(tokens);
98
98
  const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
99
99
  const simhash = SimHash.generate(tokens).toString();
100
- // 5. Soft 404 Detection
101
- let soft404Score = 0;
102
- const soft404Signals = [];
103
- if (status === 200) {
104
- const title = $('title').text().toLowerCase();
105
- const h1Text = $('h1').first().text().toLowerCase();
106
- const bodyText = cleanText.toLowerCase();
107
- const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
108
- // Pattern checks
109
- for (const pattern of errorPatterns) {
110
- if (title.includes(pattern)) {
111
- soft404Score += 0.4;
112
- soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
113
- break;
114
- }
115
- }
116
- for (const pattern of errorPatterns) {
117
- if (h1Text.includes(pattern)) {
118
- soft404Score += 0.3;
119
- soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
120
- break;
121
- }
122
- }
123
- if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
124
- soft404Score += 0.2;
125
- soft404Signals.push('body_error_phrase');
126
- }
127
- // Content length check (Word count approximation)
128
- const words = cleanText.split(/\s+/).filter(w => w.length > 0);
129
- if (words.length < 50) {
130
- soft404Score += 0.3;
131
- soft404Signals.push('very_low_word_count');
132
- }
133
- else if (words.length < 150) {
134
- soft404Score += 0.1;
135
- soft404Signals.push('low_word_count');
136
- }
137
- // Link count check
138
- if (links.size === 0) {
139
- soft404Score += 0.2;
140
- soft404Signals.push('no_outbound_links');
141
- }
142
- // Cap at 1.0
143
- soft404Score = Math.min(1.0, soft404Score);
144
- }
145
100
  return {
146
101
  links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
147
102
  html: html, // pass raw HTML for analysis
@@ -150,9 +105,7 @@ export class Parser {
150
105
  nofollow,
151
106
  contentHash,
152
107
  simhash,
153
- uniqueTokenRatio,
154
- soft404Score,
155
- soft404Signals
108
+ uniqueTokenRatio
156
109
  };
157
110
  }
158
111
  }
@@ -0,0 +1,11 @@
1
+ import { Fetcher } from './fetcher.js';
2
+ import { Site } from '../db/repositories/SiteRepository.js';
3
+ export interface ResolvedUrl {
4
+ url: string;
5
+ site: Site;
6
+ }
7
+ export declare class UrlResolver {
8
+ private siteRepo;
9
+ constructor();
10
+ resolve(inputUrl: string, fetcher: Fetcher): Promise<ResolvedUrl>;
11
+ }
@@ -0,0 +1,67 @@
1
+ import { SiteRepository } from '../db/repositories/SiteRepository.js';
2
+ import { getDb } from '../db/index.js';
3
+ export class UrlResolver {
4
+ siteRepo;
5
+ constructor() {
6
+ this.siteRepo = new SiteRepository(getDb());
7
+ }
8
+ async resolve(inputUrl, fetcher) {
9
+ const hasProtocol = inputUrl.startsWith('http://') || inputUrl.startsWith('https://');
10
+ const workingUrl = hasProtocol ? inputUrl : `https://${inputUrl}`;
11
+ let hostname;
12
+ try {
13
+ hostname = new URL(workingUrl).hostname;
14
+ }
15
+ catch {
16
+ throw new Error(`Invalid URL or domain: ${inputUrl}`);
17
+ }
18
+ const domain = hostname.replace(/^www\./, '');
19
+ let site = this.siteRepo.firstOrCreateSite(domain);
20
+ // If protocol was omitted, we use our discovery logic or stored preference
21
+ if (!hasProtocol) {
22
+ if (site.ssl !== null && site.preferred_url) {
23
+ return {
24
+ url: site.preferred_url,
25
+ site
26
+ };
27
+ }
28
+ // No protocol provided and no stored preference: Probe HTTPS first
29
+ try {
30
+ const res = await fetcher.fetch(`https://${hostname}/`);
31
+ if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
32
+ const isSsl = res.finalUrl.startsWith('https:') ? 1 : 0;
33
+ this.siteRepo.updateSitePreference(site.id, { preferred_url: res.finalUrl, ssl: isSsl });
34
+ // Refresh site object
35
+ site = this.siteRepo.getSiteById(site.id);
36
+ return { url: res.finalUrl, site };
37
+ }
38
+ }
39
+ catch {
40
+ // Fallback to HTTP
41
+ }
42
+ // Try HTTP
43
+ try {
44
+ const res = await fetcher.fetch(`http://${hostname}/`);
45
+ if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
46
+ const isSsl = res.finalUrl.startsWith('https:') ? 1 : 0;
47
+ this.siteRepo.updateSitePreference(site.id, { preferred_url: res.finalUrl, ssl: isSsl });
48
+ site = this.siteRepo.getSiteById(site.id);
49
+ return { url: res.finalUrl, site };
50
+ }
51
+ }
52
+ catch {
53
+ // If both fail, we still default to the provided input as https
54
+ return { url: workingUrl, site };
55
+ }
56
+ }
57
+ // Protocol was provided, we just return it but ensure site is in sync if it's the first time
58
+ if (site.ssl === null) {
59
+ this.siteRepo.updateSitePreference(site.id, {
60
+ preferred_url: inputUrl,
61
+ ssl: inputUrl.startsWith('https:') ? 1 : 0
62
+ });
63
+ site = this.siteRepo.getSiteById(site.id);
64
+ }
65
+ return { url: inputUrl, site };
66
+ }
67
+ }
@@ -1,4 +1,10 @@
1
+ import { EngineContext } from '../events.js';
2
+ import { Fetcher } from './fetcher.js';
1
3
  export declare class Sitemap {
4
+ private context?;
5
+ private fetcher?;
6
+ private userAgent;
7
+ constructor(context?: EngineContext | undefined, fetcher?: Fetcher | undefined, userAgent?: string);
2
8
  /**
3
9
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
4
10
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -1,7 +1,19 @@
1
- import { request } from 'undici';
2
1
  import * as cheerio from 'cheerio';
2
+ import pLimit from 'p-limit';
3
3
  import { normalizeUrl } from './normalize.js';
4
+ import { DEFAULTS } from '../constants.js';
4
5
  export class Sitemap {
6
+ context;
7
+ fetcher;
8
+ userAgent = DEFAULTS.USER_AGENT;
9
+ constructor(context, fetcher, userAgent) {
10
+ this.context = context;
11
+ this.fetcher = fetcher;
12
+ if (userAgent)
13
+ this.userAgent = userAgent;
14
+ else if (fetcher)
15
+ this.userAgent = fetcher.userAgent;
16
+ }
5
17
  /**
6
18
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
7
19
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -20,14 +32,16 @@ export class Sitemap {
20
32
  if (visited.size > 50)
21
33
  return;
22
34
  try {
23
- const res = await request(url, {
24
- maxRedirections: 3,
25
- headers: { 'User-Agent': 'crawlith/1.0' },
26
- headersTimeout: 10000,
27
- bodyTimeout: 10000
28
- });
29
- if (res.statusCode >= 200 && res.statusCode < 300) {
30
- const xml = await res.body.text();
35
+ const res = this.fetcher
36
+ ? await this.fetcher.fetch(url, { maxBytes: 5000000 })
37
+ : await (async () => {
38
+ const { request } = await import('undici');
39
+ const r = await request(url, { headers: { 'User-Agent': this.userAgent } });
40
+ const b = await r.body.text();
41
+ return { status: r.statusCode, body: b };
42
+ })();
43
+ if (typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
44
+ const xml = res.body;
31
45
  // Basic validation: must verify it looks like XML
32
46
  if (!xml.trim().startsWith('<'))
33
47
  return;
@@ -41,10 +55,9 @@ export class Sitemap {
41
55
  if (loc)
42
56
  childSitemaps.push(loc);
43
57
  });
44
- // Process children sequentially to avoid massive concurrency spike
45
- for (const childUrl of childSitemaps) {
46
- await this.processSitemap(childUrl, visited, urls);
47
- }
58
+ // Process children concurrently but with a limit to avoid massive concurrency spike
59
+ const limit = pLimit(10);
60
+ await Promise.all(childSitemaps.map(childUrl => limit(() => this.processSitemap(childUrl, visited, urls))));
48
61
  }
49
62
  else {
50
63
  // It's a URL Set
@@ -59,12 +72,9 @@ export class Sitemap {
59
72
  });
60
73
  }
61
74
  }
62
- else {
63
- await res.body.dump();
64
- }
65
75
  }
66
76
  catch (e) {
67
- console.warn(`Failed to fetch sitemap ${url}:`, e);
77
+ this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url} (${String(e)})`, context: e });
68
78
  }
69
79
  }
70
80
  }
@@ -16,7 +16,11 @@ export declare class TrapDetector {
16
16
  /**
17
17
  * Checks if a URL represents a potential crawl trap.
18
18
  */
19
- checkTrap(rawUrl: string, _depth: number): TrapResult;
19
+ checkTrap(rawUrl: string, _depth: number, isInternal?: boolean): TrapResult;
20
+ /**
21
+ * Iterates over all nodes in the graph and flags potential traps.
22
+ */
23
+ analyze(graph: any): void;
20
24
  /**
21
25
  * Resets internal state (useful for multi-crawl sessions if needed)
22
26
  */
@@ -1,7 +1,7 @@
1
1
  export class TrapDetector {
2
2
  pathCounters = new Map();
3
3
  paginationCounters = new Map();
4
- sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);
4
+ sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token', 'intended']);
5
5
  // Configurable thresholds
6
6
  PARAM_EXPLOSION_THRESHOLD = 30;
7
7
  PAGINATION_THRESHOLD = 50;
@@ -14,7 +14,12 @@ export class TrapDetector {
14
14
  /**
15
15
  * Checks if a URL represents a potential crawl trap.
16
16
  */
17
- checkTrap(rawUrl, _depth) {
17
+ checkTrap(rawUrl, _depth, isInternal = true) {
18
+ // If it's not internal (e.g., social sharing links), we don't flag it as a trap
19
+ // that affects our crawl health, even though technically it might have many params.
20
+ if (!isInternal) {
21
+ return { risk: 0, type: null };
22
+ }
18
23
  let risk = 0;
19
24
  let type = null;
20
25
  try {
@@ -68,6 +73,22 @@ export class TrapDetector {
68
73
  }
69
74
  return { risk, type };
70
75
  }
76
+ /**
77
+ * Iterates over all nodes in the graph and flags potential traps.
78
+ */
79
+ analyze(graph) {
80
+ const nodes = graph.getNodes();
81
+ for (const node of nodes) {
82
+ if (node.status === 200 || node.status === 0) {
83
+ const res = this.checkTrap(node.url, node.depth || 0, !!node.isInternal);
84
+ if (res.risk > 0.4) {
85
+ node.crawlTrapFlag = true;
86
+ node.crawlTrapRisk = res.risk;
87
+ node.trapType = res.type;
88
+ }
89
+ }
90
+ }
91
+ }
71
92
  /**
72
93
  * Resets internal state (useful for multi-crawl sessions if needed)
73
94
  */
@@ -0,0 +1,110 @@
1
+ import Database from 'better-sqlite3';
2
+ import type { CrawlithPlugin } from '../plugin-system/plugin-types.js';
3
+ export declare class CrawlithDB {
4
+ private db;
5
+ private statements;
6
+ private registry;
7
+ /**
8
+ * @internal
9
+ * Dangerous: Returns the raw better-sqlite3 instance.
10
+ * Core only. Plugins must never use this.
11
+ */
12
+ unsafeGetRawDb(): Database.Database;
13
+ private _pluginName?;
14
+ private _snapshotId?;
15
+ /** Whether live fallback is allowed (from --live flag). Core-controlled. */
16
+ private _live;
17
+ /** Whether this plugin makes network calls. Core-controlled via plugin.storage.fetchMode. */
18
+ private _fetchMode;
19
+ constructor(dbPath: string);
20
+ /**
21
+ * Schema API
22
+ */
23
+ get schema(): {
24
+ define: (columns: Record<string, string>) => void;
25
+ };
26
+ /**
27
+ * Fluent Data API (URL-scoped rows)
28
+ */
29
+ get data(): {
30
+ save: <T>(input: {
31
+ url: string;
32
+ data: T;
33
+ }) => void;
34
+ find: <T>(url: string, options?: {
35
+ maxAge?: string | number;
36
+ global?: boolean;
37
+ }) => T | null;
38
+ all: <T>() => T[];
39
+ /**
40
+ * Cache-first with live fallback. Core-enforced pattern:
41
+ * 1. If cached data exists → return it (always, regardless of age)
42
+ * 2. If no cache + fetchMode='network' + live=false → return null (skip)
43
+ * 3. If no cache + (fetchMode='local' OR live=true) → call fetchFn, save, return
44
+ *
45
+ * Plugin authors NEVER touch ctx.live — the core injects it via scope().
46
+ */
47
+ getOrFetch: <T>(url: string, fetchFn: () => Promise<T>) => Promise<T | null>;
48
+ };
49
+ /**
50
+ * Report API (Global snapshot summary)
51
+ */
52
+ get report(): {
53
+ save: (summary: any, optionalScores?: {
54
+ totalScore?: number;
55
+ scoreCount?: number;
56
+ scoreWeightSum?: number;
57
+ scoreCalculatedAt?: string;
58
+ }) => void;
59
+ find: <T>() => T | null;
60
+ };
61
+ initialize(): void;
62
+ /**
63
+ * Create a scoped instance for a specific plugin.
64
+ * Also bakes in live + fetchMode so getOrFetch() can enforce the protocol
65
+ * without exposing those controls to the plugin author.
66
+ */
67
+ scope(pluginName: string, snapshotId?: number | string, options?: {
68
+ live?: boolean;
69
+ fetchMode?: 'local' | 'network';
70
+ }): CrawlithDB;
71
+ registerPluginDataSchema(pluginNameOrColumns: string | Record<string, string>, extraColumns?: Record<string, string>): void;
72
+ /** @deprecated Use registerPluginDataSchema */
73
+ registerPluginMigration(pluginName: string, migrationSQL: string): void;
74
+ getPageIdByUrl(snapshotId: number | string, url: string): number | null;
75
+ insertPluginReport(input: {
76
+ snapshotId?: number | string;
77
+ pluginName?: string;
78
+ summary: unknown;
79
+ totalScore?: number;
80
+ scoreCount?: number;
81
+ scoreWeightSum?: number;
82
+ scoreCalculatedAt?: string;
83
+ }): void;
84
+ insertPluginRow<T>(input: {
85
+ tableName?: string;
86
+ snapshotId?: number | string;
87
+ url: string;
88
+ data: T;
89
+ }): void;
90
+ getPluginReport(snapshotId?: number | string, pluginName?: string): unknown | null;
91
+ getPluginRows<T>(tableName?: string, snapshotId?: number | string): T[];
92
+ getPluginRow<T>(tableNameOrUrl: string, snapshotId?: number | string, url?: string, options?: {
93
+ maxAge?: string | number;
94
+ global?: boolean;
95
+ }): T | null;
96
+ private _parseDuration;
97
+ private _parseRow;
98
+ deleteSnapshotPlugins(snapshotId: number | string): void;
99
+ private _getOrFetch;
100
+ aggregateScoreProviders(snapshotId: number | string, plugins: CrawlithPlugin[]): void;
101
+ runInTransaction(fn: () => void): void;
102
+ private _resolveTableName;
103
+ /** Converts a plugin name to its canonical SQLite table name, sanitizing invalid characters. */
104
+ private _toTableName;
105
+ close(): void;
106
+ private _isMigrationExecuted;
107
+ private _assertSnapshotExists;
108
+ private _assertTableRegistered;
109
+ private _assertOwnership;
110
+ }