@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,7 +1,10 @@
1
1
  import { EngineContext } from '../events.js';
2
+ import { Fetcher } from './fetcher.js';
2
3
  export declare class Sitemap {
3
4
  private context?;
4
- constructor(context?: EngineContext | undefined);
5
+ private fetcher?;
6
+ private userAgent;
7
+ constructor(context?: EngineContext | undefined, fetcher?: Fetcher | undefined, userAgent?: string);
5
8
  /**
6
9
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
7
10
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -1,10 +1,18 @@
1
- import { request } from 'undici';
2
1
  import * as cheerio from 'cheerio';
2
+ import pLimit from 'p-limit';
3
3
  import { normalizeUrl } from './normalize.js';
4
+ import { DEFAULTS } from '../constants.js';
4
5
  export class Sitemap {
5
6
  context;
6
- constructor(context) {
7
+ fetcher;
8
+ userAgent = DEFAULTS.USER_AGENT;
9
+ constructor(context, fetcher, userAgent) {
7
10
  this.context = context;
11
+ this.fetcher = fetcher;
12
+ if (userAgent)
13
+ this.userAgent = userAgent;
14
+ else if (fetcher)
15
+ this.userAgent = fetcher.userAgent;
8
16
  }
9
17
  /**
10
18
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
@@ -24,14 +32,16 @@ export class Sitemap {
24
32
  if (visited.size > 50)
25
33
  return;
26
34
  try {
27
- const res = await request(url, {
28
- maxRedirections: 3,
29
- headers: { 'User-Agent': 'crawlith/1.0' },
30
- headersTimeout: 10000,
31
- bodyTimeout: 10000
32
- });
33
- if (res.statusCode >= 200 && res.statusCode < 300) {
34
- const xml = await res.body.text();
35
+ const res = this.fetcher
36
+ ? await this.fetcher.fetch(url, { maxBytes: 5000000 })
37
+ : await (async () => {
38
+ const { request } = await import('undici');
39
+ const r = await request(url, { headers: { 'User-Agent': this.userAgent } });
40
+ const b = await r.body.text();
41
+ return { status: r.statusCode, body: b };
42
+ })();
43
+ if (typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
44
+ const xml = res.body;
35
45
  // Basic validation: must verify it looks like XML
36
46
  if (!xml.trim().startsWith('<'))
37
47
  return;
@@ -45,10 +55,9 @@ export class Sitemap {
45
55
  if (loc)
46
56
  childSitemaps.push(loc);
47
57
  });
48
- // Process children sequentially to avoid massive concurrency spike
49
- for (const childUrl of childSitemaps) {
50
- await this.processSitemap(childUrl, visited, urls);
51
- }
58
+ // Process children concurrently but with a limit to avoid massive concurrency spike
59
+ const limit = pLimit(10);
60
+ await Promise.all(childSitemaps.map(childUrl => limit(() => this.processSitemap(childUrl, visited, urls))));
52
61
  }
53
62
  else {
54
63
  // It's a URL Set
@@ -63,12 +72,9 @@ export class Sitemap {
63
72
  });
64
73
  }
65
74
  }
66
- else {
67
- await res.body.dump();
68
- }
69
75
  }
70
76
  catch (e) {
71
- this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
77
+ this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url} (${String(e)})`, context: e });
72
78
  }
73
79
  }
74
80
  }
@@ -16,7 +16,11 @@ export declare class TrapDetector {
16
16
  /**
17
17
  * Checks if a URL represents a potential crawl trap.
18
18
  */
19
- checkTrap(rawUrl: string, _depth: number): TrapResult;
19
+ checkTrap(rawUrl: string, _depth: number, isInternal?: boolean): TrapResult;
20
+ /**
21
+ * Iterates over all nodes in the graph and flags potential traps.
22
+ */
23
+ analyze(graph: any): void;
20
24
  /**
21
25
  * Resets internal state (useful for multi-crawl sessions if needed)
22
26
  */
@@ -1,7 +1,7 @@
1
1
  export class TrapDetector {
2
2
  pathCounters = new Map();
3
3
  paginationCounters = new Map();
4
- sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);
4
+ sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token', 'intended']);
5
5
  // Configurable thresholds
6
6
  PARAM_EXPLOSION_THRESHOLD = 30;
7
7
  PAGINATION_THRESHOLD = 50;
@@ -14,7 +14,12 @@ export class TrapDetector {
14
14
  /**
15
15
  * Checks if a URL represents a potential crawl trap.
16
16
  */
17
- checkTrap(rawUrl, _depth) {
17
+ checkTrap(rawUrl, _depth, isInternal = true) {
18
+ // If it's not internal (e.g., social sharing links), we don't flag it as a trap
19
+ // that affects our crawl health, even though technically it might have many params.
20
+ if (!isInternal) {
21
+ return { risk: 0, type: null };
22
+ }
18
23
  let risk = 0;
19
24
  let type = null;
20
25
  try {
@@ -68,6 +73,22 @@ export class TrapDetector {
68
73
  }
69
74
  return { risk, type };
70
75
  }
76
+ /**
77
+ * Iterates over all nodes in the graph and flags potential traps.
78
+ */
79
+ analyze(graph) {
80
+ const nodes = graph.getNodes();
81
+ for (const node of nodes) {
82
+ if (node.status === 200 || node.status === 0) {
83
+ const res = this.checkTrap(node.url, node.depth || 0, !!node.isInternal);
84
+ if (res.risk > 0.4) {
85
+ node.crawlTrapFlag = true;
86
+ node.crawlTrapRisk = res.risk;
87
+ node.trapType = res.type;
88
+ }
89
+ }
90
+ }
91
+ }
71
92
  /**
72
93
  * Resets internal state (useful for multi-crawl sessions if needed)
73
94
  */
@@ -0,0 +1,110 @@
1
+ import Database from 'better-sqlite3';
2
+ import type { CrawlithPlugin } from '../plugin-system/plugin-types.js';
3
+ export declare class CrawlithDB {
4
+ private db;
5
+ private statements;
6
+ private registry;
7
+ /**
8
+ * @internal
9
+ * Dangerous: Returns the raw better-sqlite3 instance.
10
+ * Core only. Plugins must never use this.
11
+ */
12
+ unsafeGetRawDb(): Database.Database;
13
+ private _pluginName?;
14
+ private _snapshotId?;
15
+ /** Whether live fallback is allowed (from --live flag). Core-controlled. */
16
+ private _live;
17
+ /** Whether this plugin makes network calls. Core-controlled via plugin.storage.fetchMode. */
18
+ private _fetchMode;
19
+ constructor(dbPath: string);
20
+ /**
21
+ * Schema API
22
+ */
23
+ get schema(): {
24
+ define: (columns: Record<string, string>) => void;
25
+ };
26
+ /**
27
+ * Fluent Data API (URL-scoped rows)
28
+ */
29
+ get data(): {
30
+ save: <T>(input: {
31
+ url: string;
32
+ data: T;
33
+ }) => void;
34
+ find: <T>(url: string, options?: {
35
+ maxAge?: string | number;
36
+ global?: boolean;
37
+ }) => T | null;
38
+ all: <T>() => T[];
39
+ /**
40
+ * Cache-first with live fallback. Core-enforced pattern:
41
+ * 1. If cached data exists → return it (always, regardless of age)
42
+ * 2. If no cache + fetchMode='network' + live=false → return null (skip)
43
+ * 3. If no cache + (fetchMode='local' OR live=true) → call fetchFn, save, return
44
+ *
45
+ * Plugin authors NEVER touch ctx.live — the core injects it via scope().
46
+ */
47
+ getOrFetch: <T>(url: string, fetchFn: () => Promise<T>) => Promise<T | null>;
48
+ };
49
+ /**
50
+ * Report API (Global snapshot summary)
51
+ */
52
+ get report(): {
53
+ save: (summary: any, optionalScores?: {
54
+ totalScore?: number;
55
+ scoreCount?: number;
56
+ scoreWeightSum?: number;
57
+ scoreCalculatedAt?: string;
58
+ }) => void;
59
+ find: <T>() => T | null;
60
+ };
61
+ initialize(): void;
62
+ /**
63
+ * Create a scoped instance for a specific plugin.
64
+ * Also bakes in live + fetchMode so getOrFetch() can enforce the protocol
65
+ * without exposing those controls to the plugin author.
66
+ */
67
+ scope(pluginName: string, snapshotId?: number | string, options?: {
68
+ live?: boolean;
69
+ fetchMode?: 'local' | 'network';
70
+ }): CrawlithDB;
71
+ registerPluginDataSchema(pluginNameOrColumns: string | Record<string, string>, extraColumns?: Record<string, string>): void;
72
+ /** @deprecated Use registerPluginDataSchema */
73
+ registerPluginMigration(pluginName: string, migrationSQL: string): void;
74
+ getPageIdByUrl(snapshotId: number | string, url: string): number | null;
75
+ insertPluginReport(input: {
76
+ snapshotId?: number | string;
77
+ pluginName?: string;
78
+ summary: unknown;
79
+ totalScore?: number;
80
+ scoreCount?: number;
81
+ scoreWeightSum?: number;
82
+ scoreCalculatedAt?: string;
83
+ }): void;
84
+ insertPluginRow<T>(input: {
85
+ tableName?: string;
86
+ snapshotId?: number | string;
87
+ url: string;
88
+ data: T;
89
+ }): void;
90
+ getPluginReport(snapshotId?: number | string, pluginName?: string): unknown | null;
91
+ getPluginRows<T>(tableName?: string, snapshotId?: number | string): T[];
92
+ getPluginRow<T>(tableNameOrUrl: string, snapshotId?: number | string, url?: string, options?: {
93
+ maxAge?: string | number;
94
+ global?: boolean;
95
+ }): T | null;
96
+ private _parseDuration;
97
+ private _parseRow;
98
+ deleteSnapshotPlugins(snapshotId: number | string): void;
99
+ private _getOrFetch;
100
+ aggregateScoreProviders(snapshotId: number | string, plugins: CrawlithPlugin[]): void;
101
+ runInTransaction(fn: () => void): void;
102
+ private _resolveTableName;
103
+ /** Converts a plugin name to its canonical SQLite table name, sanitizing invalid characters. */
104
+ private _toTableName;
105
+ close(): void;
106
+ private _isMigrationExecuted;
107
+ private _assertSnapshotExists;
108
+ private _assertTableRegistered;
109
+ private _assertOwnership;
110
+ }