@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -0,0 +1,16 @@
1
+ /**
+ * Per-plugin view of the encrypted config store.
+ * Every operation is bound to the plugin name passed to the constructor.
+ */
+ export declare class PluginConfig {
2
+ private pluginName;
3
+ constructor(pluginName: string);
4
+ /**
5
+ * Get a decrypted config key for the current plugin.
+ * `keyName` defaults to the plugin name; passing any other section name
+ * throws a "Security Violation" error, so a plugin can only read its own section.
6
+ */
7
+ get(keyName?: string): string;
8
+ /**
9
+ * Get a decrypted config key, or throw a user-friendly error if it's missing.
+ * Any failure raised by `get` is rethrown as a "Missing <section> configuration"
+ * error naming the `crawlith config <section> set <value>` command, with the
+ * original error attached as `cause`.
10
+ */
11
+ require(keyName?: string): string;
12
+ /**
13
+ * Set/Encrypt a config key for the current plugin.
+ * The value is always written under the plugin's own name; no other section can be targeted.
14
+ */
15
+ set(value: string): void;
16
+ }
@@ -0,0 +1,36 @@
1
+ import { getDecryptedConfigKey, setEncryptedConfigKey } from '../utils/secureConfig.js';
2
+ export class PluginConfig {
3
+ pluginName;
4
+ constructor(pluginName) {
5
+ this.pluginName = pluginName;
6
+ }
7
+ /**
8
+ * Get a decrypted config key for the current plugin.
9
+ */
10
+ get(keyName) {
11
+ const section = keyName || this.pluginName;
12
+ // Safety check: ensure plugins can only access their own config section
13
+ if (section !== this.pluginName) {
14
+ throw new Error(`Security Violation: Plugin "${this.pluginName}" attempted to access config for "${section}"`);
15
+ }
16
+ return getDecryptedConfigKey(section);
17
+ }
18
+ /**
19
+ * Get a decrypted config key, or throw a user-friendly error if it's missing.
20
+ */
21
+ require(keyName) {
22
+ try {
23
+ return this.get(keyName);
24
+ }
25
+ catch (_error) {
26
+ const section = keyName || this.pluginName;
27
+ throw new Error(`Missing ${section} configuration. Please run: crawlith config ${section} set <value>`, { cause: _error });
28
+ }
29
+ }
30
+ /**
31
+ * Set/Encrypt a config key for the current plugin.
32
+ */
33
+ set(value) {
34
+ setEncryptedConfigKey(this.pluginName, value);
35
+ }
36
+ }
@@ -0,0 +1,17 @@
1
+ import { CrawlithPlugin } from './plugin-types.js';
2
+ /**
+ * Minimal logger accepted by `PluginLoader`.
+ * Only `debug` is required; `info`, `warn` and `error` are optional.
+ */
+ export interface PluginLoaderLogger {
3
+ debug(msg: string): void;
4
+ info?(msg: string): void;
5
+ warn?(msg: string): void;
6
+ error?(msg: string): void;
7
+ }
8
+ /**
+ * Discovers Crawlith plugins on disk and loads them with dynamic `import()`.
+ * Loaded plugins are kept in an internal map keyed by plugin name.
+ */
+ export declare class PluginLoader {
9
+ private plugins;
10
+ private logger?;
11
+ constructor(logger?: PluginLoaderLogger);
+ /**
+ * Scans `<rootPath>/packages/plugins` (internal plugins) and then
+ * `<rootPath>/node_modules` (external plugins), skipping either directory
+ * when it does not exist, and resolves with every plugin loaded so far.
+ * Individual load failures are logged at debug level and do not reject.
+ */
12
+ discover(rootPath: string): Promise<CrawlithPlugin[]>;
13
+ private loadFromDir;
14
+ private loadFromNodeModules;
15
+ private tryLoadPlugin;
16
+ private validatePlugin;
17
+ }
@@ -0,0 +1,122 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import { pathToFileURL } from 'node:url';
4
/** `exports` keys tried, in priority order, when looking for a package's entry file. */
const EXPORT_TARGET_KEYS = ['.', 'import', 'default'];
/** Upper bound on nested `exports` levels followed; guards against pathological manifests. */
const MAX_EXPORT_DEPTH = 8;
/**
 * Resolves a package.json `exports` value to a single relative file path.
 * Follows nested condition objects (e.g. `{ ".": { "import": "./dist/index.js" } }`)
 * and fallback arrays instead of giving up on the first non-string value.
 *
 * @param target Raw `exports` value or one of its nested targets.
 * @param depth Current nesting level (internal).
 * @returns The first string target found, or `undefined` when none exists.
 */
function resolveExportTarget(target, depth = 0) {
    if (typeof target === 'string') {
        return target;
    }
    if (target === null || typeof target !== 'object' || depth > MAX_EXPORT_DEPTH) {
        return undefined;
    }
    const candidates = Array.isArray(target)
        ? target
        : EXPORT_TARGET_KEYS.map((key) => target[key]);
    for (const candidate of candidates) {
        if (candidate === undefined || candidate === null) {
            continue;
        }
        const resolved = resolveExportTarget(candidate, depth + 1);
        if (resolved !== undefined) {
            return resolved;
        }
    }
    return undefined;
}
/**
 * Picks the entry file declared by a package manifest:
 * `exports` first, then `main`, then the conventional `index.js`.
 */
function resolveEntryPoint(pkg) {
    const fromExports = resolveExportTarget(pkg.exports);
    if (fromExports !== undefined) {
        return fromExports;
    }
    if (typeof pkg.main === 'string' && pkg.main !== '') {
        return pkg.main;
    }
    return 'index.js';
}
/**
 * True for directory entries that may hold a package.
 * `Dirent.isDirectory()` is false for symlinks, yet linked packages
 * (pnpm, `npm link`, workspaces) appear as symlinks, so those are accepted too;
 * `tryLoadPlugin` still requires a readable package.json behind the entry.
 */
function isPackageDirCandidate(entry) {
    return entry.isDirectory() || entry.isSymbolicLink();
}
/** Turns any thrown value into a loggable message. */
function describeError(err) {
    return err?.message ?? String(err);
}
/**
 * Discovers Crawlith plugins on disk and loads them with dynamic `import()`.
 * Loaded plugins are kept in a map keyed by plugin name, so names are unique.
 */
export class PluginLoader {
    plugins = new Map();
    logger;
    /**
     * @param logger Optional logger; only its `debug` method is used.
     */
    constructor(logger) {
        this.logger = logger;
    }
    /**
     * Scans the internal plugin directory and then `node_modules` under `rootPath`.
     *
     * @param rootPath Directory containing `packages/plugins` and/or `node_modules`.
     * @returns Every plugin loaded so far, in load order.
     */
    async discover(rootPath) {
        // 1. Discover Internal Plugins
        const internalPath = path.resolve(rootPath, 'packages/plugins');
        if (fs.existsSync(internalPath)) {
            this.logger?.debug(`[plugin] Scanning internal directory: ${internalPath}`);
            await this.loadFromDir(internalPath, 'internal');
        }
        // 2. Discover External Plugins
        const nodeModulesPath = path.resolve(rootPath, 'node_modules');
        if (fs.existsSync(nodeModulesPath)) {
            this.logger?.debug(`[plugin] Scanning node_modules: ${nodeModulesPath}`);
            await this.loadFromNodeModules(nodeModulesPath);
        }
        return Array.from(this.plugins.values());
    }
    /** Tries every sub-directory of `dirPath` as a plugin of the given `type`. */
    async loadFromDir(dirPath, type) {
        const entries = fs.readdirSync(dirPath, { withFileTypes: true });
        for (const entry of entries) {
            if (isPackageDirCandidate(entry)) {
                // Sequential on purpose: load order decides which duplicate wins.
                await this.tryLoadPlugin(path.join(dirPath, entry.name), type);
            }
        }
    }
    /** Tries every package in `node_modules`, descending one level into `@scope` folders. */
    async loadFromNodeModules(nodeModulesPath) {
        if (!fs.existsSync(nodeModulesPath))
            return;
        const entries = fs.readdirSync(nodeModulesPath, { withFileTypes: true });
        for (const entry of entries) {
            if (!isPackageDirCandidate(entry))
                continue;
            const entryPath = path.join(nodeModulesPath, entry.name);
            if (entry.name.startsWith('@')) {
                // Scoped packages
                await this.loadFromDir(entryPath, 'external');
            }
            else {
                await this.tryLoadPlugin(entryPath, 'external');
            }
        }
    }
    /**
     * Loads the package at `pluginPath` when it identifies itself as a Crawlith plugin.
     * Never throws: every failure (bad manifest, import error, duplicate name)
     * is reported through `logger.debug` and the package is skipped.
     *
     * @param pluginPath Directory expected to contain a package.json.
     * @param type `'internal'` packages are always treated as plugins;
     *   `'external'` ones must match a naming convention or declare
     *   `crawlith.type === 'plugin'`.
     */
    async tryLoadPlugin(pluginPath, type) {
        const pkgPath = path.join(pluginPath, 'package.json');
        if (!fs.existsSync(pkgPath))
            return;
        try {
            const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8'));
            const isCrawlithPlugin = pkg.name?.startsWith('@crawlith/plugin-') ||
                pkg.name?.startsWith('crawlith-plugin-') ||
                pkg.crawlith?.type === 'plugin' ||
                type === 'internal';
            if (!isCrawlithPlugin)
                return;
            let fullEntryPoint = path.join(pluginPath, resolveEntryPoint(pkg));
            // If we're loading a .ts file as the intended entry point, but a .js one exists in dist,
            // prefer the .js one to avoid requiring a TS loader at runtime.
            if (fullEntryPoint.endsWith('.ts')) {
                const distJsPath = path.join(pluginPath, 'dist', 'index.js');
                if (fs.existsSync(distJsPath)) {
                    fullEntryPoint = distJsPath;
                }
            }
            // Last resort: an un-built package that only ships its TypeScript entry.
            const fallbackTsPath = path.join(pluginPath, 'index.ts');
            if (!fs.existsSync(fullEntryPoint) && fs.existsSync(fallbackTsPath)) {
                fullEntryPoint = fallbackTsPath;
            }
            if (!fs.existsSync(fullEntryPoint)) {
                this.logger?.debug(`[plugin] Skipped: ${pkg.name} (entry point not found: ${fullEntryPoint})`);
                return;
            }
            const imported = await import(pathToFileURL(fullEntryPoint).href);
            // A module namespace object is not extensible, so validatePlugin could not
            // fill in `version`/`description` on it; work on a shallow copy instead.
            const plugin = imported.default || { ...imported };
            if (this.validatePlugin(plugin, pkg)) {
                if (this.plugins.has(plugin.name)) {
                    throw new Error(`Duplicate plugin name: ${plugin.name}`);
                }
                this.plugins.set(plugin.name, plugin);
                this.logger?.debug(`[plugin] Loaded: ${plugin.name} v${plugin.version} (${type})`);
            }
        }
        catch (err) {
            this.logger?.debug(`[plugin] Failed to load plugin at ${pluginPath}: ${describeError(err)}`);
        }
    }
    /**
     * Checks that `plugin` is an object with a `name`, and fills in missing
     * `version` / `description` from the package manifest (mutates `plugin`).
     *
     * @returns `true` when the plugin is usable, `false` when it was skipped.
     */
    validatePlugin(plugin, pkg) {
        if (!plugin || typeof plugin !== 'object') {
            this.logger?.debug(`[plugin] Skipped: ${pkg.name} (invalid export)`);
            return false;
        }
        if (!plugin.name) {
            this.logger?.debug(`[plugin] Skipped: ${pkg.name} (missing name)`);
            return false;
        }
        if (!plugin.version) {
            plugin.version = pkg.version || '0.0.0';
        }
        if (!plugin.description) {
            plugin.description = pkg.description || '';
        }
        return true;
    }
}
@@ -0,0 +1,25 @@
1
+ import { CrawlithPlugin, PluginContext } from './plugin-types.js';
2
+ /**
+ * Holds the active plugins and dispatches CLI registration, storage setup
+ * and lifecycle hooks to them.
+ */
+ export declare class PluginRegistry {
3
+ private plugins;
4
+ constructor(plugins?: CrawlithPlugin[]);
+ /** Same array as `getPlugins()` returns (the registry's internal list, not a copy). */
5
+ get pluginsList(): CrawlithPlugin[];
6
+ private registeredCommands;
7
+ /**
8
+ * Registers all plugin CLI flags on the given command.
9
+ * Handles both declarative `cli` config and legacy `register(cmd)` callbacks.
+ * Does nothing unless `program` is a Commander `Command`; walks the command and
+ * all of its sub-commands, visiting each command at most once per registry.
10
+ */
11
+ registerPlugins(program: any): void;
12
+ /**
13
+ * Applies declarative `storage` schemas for all plugins.
14
+ * Called by the core after `getCrawlithDB()` is available, before any hooks run.
+ * Plugins without `storage.perPage.columns` are skipped, as is everything when
+ * `context.db` is missing; a schema failure is logged and does not throw.
15
+ */
16
+ applyStorage(context: PluginContext): void;
17
+ /** Hooks that only make sense during a full site crawl. */
18
+ private static readonly CRAWL_ONLY_HOOKS;
19
+ /** Hooks that only make sense during a single-page analysis. */
20
+ private static readonly PAGE_ONLY_HOOKS;
+ /**
+ * Awaits `hookName` on every plugin that defines it, in registration order,
+ * passing a per-plugin context (scoped db, plugin config, CLI writer).
+ * Hooks that do not belong to `context.scope` are skipped silently; an error
+ * thrown by a hook is logged and does not stop the remaining plugins.
+ */
21
+ runHook(hookName: string, context: PluginContext, payload?: any): Promise<void>;
+ /**
+ * Calls `hookName` synchronously on each plugin that defines it and returns the
+ * first result that is not `undefined`; returns `undefined` when no plugin
+ * produces one. Errors thrown by a hook are logged and the next plugin is tried.
+ */
22
+ runSyncBailHook(hookName: string, context: PluginContext, ...args: any[]): any;
+ /** Returns the registry's internal plugin array (not a copy). */
23
+ getPlugins(): CrawlithPlugin[];
+ /** Appends a plugin; throws "Duplicate plugin name" when the name is already registered. */
24
+ addPlugin(plugin: CrawlithPlugin): void;
25
+ }
@@ -0,0 +1,167 @@
1
+ import { Command } from 'commander';
2
+ import { PluginConfig } from './plugin-config.js';
3
/** Turns any thrown value into a loggable message. */
function describeError(err) {
    return err?.message ?? String(err);
}
/**
 * Registers one Commander option, passing a default value only when the
 * plugin actually supplied one (neither `undefined` nor `null`).
 */
function addOption(cmd, flag, description, defaultValue) {
    if (defaultValue !== undefined && defaultValue !== null) {
        cmd.option(flag, description, defaultValue);
    }
    else {
        cmd.option(flag, description);
    }
}
/**
 * True when `hookName` must not run in the given execution scope.
 * An undefined scope (legacy/test contexts) is treated as permissive.
 */
function isHookOutOfScope(scope, hookName) {
    if (scope === 'page') {
        return PluginRegistry.CRAWL_ONLY_HOOKS.has(hookName);
    }
    if (scope === 'crawl') {
        return PluginRegistry.PAGE_ONLY_HOOKS.has(hookName);
    }
    return false;
}
/**
 * Builds the per-plugin context handed to a hook: the shared context plus a
 * plugin-scoped db handle, a plugin-bound config and a CLI writer.
 *
 * @param plugin Plugin about to receive the hook call.
 * @param context Shared context supplied by the core.
 * @param snapshotId Snapshot id used to scope the db handle.
 * @param targetUrl Target URL exposed to the plugin.
 */
function buildScopedContext(plugin, context, snapshotId, targetUrl) {
    const scopedDb = context.db?.scope(plugin.name, snapshotId, {
        live: context.live,
        fetchMode: plugin.storage?.fetchMode
    });
    // Forwards to the host logger as-is (no prefixing). Each call tolerates a
    // missing logger or a logger lacking that level — satisfies both cli and logger.
    const cliWriter = {
        info: (m) => context.logger?.info?.(m),
        warn: (m) => context.logger?.warn?.(m),
        error: (m) => context.logger?.error?.(m),
        debug: (m) => context.logger?.debug?.(m),
    };
    return {
        ...context,
        db: scopedDb,
        config: new PluginConfig(plugin.name),
        targetUrl,
        cli: cliWriter,
        logger: cliWriter, // keep logger alias in sync
    };
}
/**
 * Holds the active plugins and dispatches CLI registration, storage setup
 * and lifecycle hooks to them.
 */
export class PluginRegistry {
    plugins = [];
    /** Commands already processed by `registerPlugins`, so options are never added twice. */
    registeredCommands = new WeakSet();
    /**
     * @param plugins Initial plugin list; the array is stored as given (not copied).
     */
    constructor(plugins = []) {
        this.plugins = plugins;
    }
    /** The registry's internal plugin array (same object as `getPlugins()`). */
    get pluginsList() {
        return this.plugins;
    }
    /**
     * Registers all plugin CLI flags on the given command.
     * Handles both declarative `cli` config and legacy `register(cmd)` callbacks.
     *
     * Walks `program` and all of its sub-commands. Declarative options are added
     * only to commands whose name is listed in `plugin.cli.for` (default
     * `['page', 'crawl']`); the legacy `register` callback is invoked for every
     * visited command. Does nothing when `program` is not a Commander `Command`.
     */
    registerPlugins(program) {
        if (!(program instanceof Command))
            return;
        const traverse = (cmd) => {
            if (this.registeredCommands.has(cmd))
                return;
            this.registeredCommands.add(cmd);
            const cmdName = cmd.name();
            for (const plugin of this.plugins) {
                // Declarative cli registration (preferred)
                if (plugin.cli) {
                    const targets = plugin.cli.for ?? ['page', 'crawl'];
                    if (targets.includes(cmdName)) {
                        addOption(cmd, plugin.cli.flag, plugin.cli.description, plugin.cli.defaultValue);
                        for (const opt of plugin.cli.options ?? []) {
                            addOption(cmd, opt.flag, opt.description, opt.defaultValue);
                        }
                    }
                }
                // Legacy imperative registration (backwards compat)
                if (typeof plugin.register === 'function') {
                    plugin.register(cmd);
                }
            }
            for (const sub of cmd.commands) {
                traverse(sub);
            }
        };
        traverse(program);
    }
    /**
     * Applies declarative `storage` schemas for all plugins.
     * Called by the core after `getCrawlithDB()` is available, before any hooks run.
     *
     * Plugins without `storage.perPage.columns` are skipped. A failing schema
     * definition is logged through `context.logger` and does not abort the others.
     */
    applyStorage(context) {
        if (!context.db)
            return;
        for (const plugin of this.plugins) {
            const columns = plugin.storage?.perPage?.columns;
            if (!columns)
                continue;
            try {
                const scopedDb = context.db.scope(plugin.name, context.snapshotId, {
                    live: context.live,
                    fetchMode: plugin.storage.fetchMode
                });
                scopedDb.schema.define(columns);
            }
            catch (err) {
                context.logger?.error(`[plugin:${plugin.name}] Storage schema failed: ${describeError(err)}`);
            }
        }
    }
    // ─── Scope guards ─────────────────────────────────────────────────────────
    /** Hooks that only make sense during a full site crawl. */
    static CRAWL_ONLY_HOOKS = new Set([
        'onCrawlStart', 'onPageParsed', 'onGraphBuilt', 'onMetrics', 'onReport', 'shouldEnqueueUrl'
    ]);
    /** Hooks that only make sense during a single-page analysis. */
    static PAGE_ONLY_HOOKS = new Set([
        'onPage'
    ]);
    // ─── Hook runners ─────────────────────────────────────────────────────────
    /**
     * Awaits `hookName` on every plugin that defines it, in registration order.
     *
     * @param hookName Name of the hook on `plugin.hooks`.
     * @param context Shared context; each plugin receives its own scoped copy.
     * @param payload Optional second argument for the hook. Its `snapshotId` and
     *   `pages[0].url` are used as fallbacks when the context lacks
     *   `snapshotId` / `targetUrl`.
     */
    async runHook(hookName, context, payload) {
        // Enforce scope: silently skip hooks that don't belong to this execution context.
        if (isHookOutOfScope(context.scope, hookName))
            return;
        for (const plugin of this.plugins) {
            const hooks = plugin.hooks;
            if (!hooks || typeof hooks[hookName] !== 'function')
                continue;
            const scopedContext = buildScopedContext(plugin, context, context.snapshotId || payload?.snapshotId,
            // Resolve targetUrl from payload if available (standard result object)
            context.targetUrl || payload?.pages?.[0]?.url);
            try {
                // Invoked through `hooks` so the hook keeps its original `this`.
                if (payload !== undefined) {
                    await hooks[hookName](scopedContext, payload);
                }
                else {
                    await hooks[hookName](scopedContext);
                }
            }
            catch (err) {
                context.logger?.error(`[plugin:${plugin.name}] Hook ${hookName} failed: ${describeError(err)}`);
            }
        }
    }
    /**
     * Calls `hookName` synchronously on each plugin that defines it and returns
     * the first result that is not `undefined`.
     *
     * Applies the same scope guard as `runHook`, so crawl-only hooks such as
     * `shouldEnqueueUrl` are skipped in `page` scope, and hands plugins the same
     * scoped context shape (`db`, `config`, `cli`, `logger`).
     *
     * @returns The first defined hook result, or `undefined` when no plugin
     *   produced one or the hook is out of scope.
     */
    runSyncBailHook(hookName, context, ...args) {
        if (isHookOutOfScope(context.scope, hookName))
            return undefined;
        for (const plugin of this.plugins) {
            const hooks = plugin.hooks;
            if (!hooks || typeof hooks[hookName] !== 'function')
                continue;
            const scopedContext = buildScopedContext(plugin, context, context.snapshotId, context.targetUrl);
            try {
                const result = hooks[hookName](scopedContext, ...args);
                if (result !== undefined)
                    return result;
            }
            catch (err) {
                context.logger?.error(`[plugin:${plugin.name}] Sync bail hook ${hookName} failed: ${describeError(err)}`);
            }
        }
        return undefined;
    }
    /** Returns the registry's internal plugin array (not a copy). */
    getPlugins() {
        return this.plugins;
    }
    /**
     * Appends a plugin to the registry.
     *
     * @throws Error "Duplicate plugin name" when a plugin with the same name exists.
     */
    addPlugin(plugin) {
        if (this.plugins.some(p => p.name === plugin.name)) {
            throw new Error(`Duplicate plugin name: ${plugin.name}`);
        }
        this.plugins.push(plugin);
    }
}
@@ -0,0 +1,205 @@
1
+ import type { CrawlithDB } from '../db/CrawlithDB.js';
2
+ /**
3
+ * Execution scope — set by the core before any hooks run.
4
+ * - `page` → single-URL analysis (crawlith page …)
5
+ * - `crawl` → full site crawl (crawlith crawl …)
+ *
+ * `PluginRegistry.runHook` uses this value to silently skip hooks that belong
+ * to the other scope.
6
+ */
7
+ export type PluginScope = 'page' | 'crawl';
8
+ /**
+ * Injected into hook contexts to let plugins emit structured CLI output.
+ * All four levels are required and take a single message string.
+ */
9
+ export interface CLIWriter {
10
+ info(message: string): void;
11
+ warn(message: string): void;
12
+ error(message: string): void;
13
+ debug(message: string): void;
14
+ }
15
+ /**
16
+ * Injected into report-phase hook contexts.
17
+ * Plugins contribute structured data that the core aggregates into the final output.
+ * NOTE(review): `PluginContext` has no typed field for this writer, so it can
+ * only arrive through the context's index signature — confirm the key the core uses.
18
+ */
19
+ export interface ReportWriter {
20
+ /** Attach a named data section to the report output. */
21
+ addSection(pluginName: string, data: unknown): void;
22
+ /**
+ * Optionally contribute a weighted score component.
+ * The method itself is optional, so callers must check it exists before calling.
+ */
23
+ contributeScore?(input: {
24
+ label: string;
25
+ score: number;
26
+ weight: number;
27
+ }): void;
28
+ }
29
+ /** Passed to `onPage` — the single URL being analyzed. */
30
+ export interface PageInput {
31
+ /** Absolute URL of the page being analyzed. */
32
+ url: string;
33
+ /** Raw HTML content of the page. */
34
+ html: string;
35
+ /** HTTP status code. */
36
+ status: number;
37
+ /** HTTP Response headers. Optional; a header value is either one string or a list of strings. */
38
+ headers?: Record<string, string | string[]>;
39
+ }
40
+ /** Base context available in every hook. */
41
+ export interface PluginContext {
42
+ /** Execution scope. Undefined only in legacy / test contexts — treated as permissive. */
43
+ scope?: PluginScope;
44
+ command?: string;
45
+ /** Whether live fallback is allowed (from --live flag). Core-controlled. */
46
+ live?: boolean;
47
+ flags?: Record<string, any>;
+ /** Snapshot used to scope the plugin's db handle; `runHook` falls back to `payload.snapshotId` when unset. */
48
+ snapshotId?: number;
+ /** URL under analysis; `runHook` falls back to `payload.pages[0].url` when unset. */
49
+ targetUrl?: string;
+ /** Database handle. Inside a hook this is the result of `db.scope(pluginName, snapshotId, …)`. */
50
+ db?: CrawlithDB;
+ /** Config accessor; inside a hook the registry supplies a `PluginConfig` bound to the plugin's name. */
51
+ config?: {
52
+ get(key?: string): string;
53
+ require(key?: string): string;
54
+ set(value: string): void;
55
+ };
56
+ /** CLI writer — populated by the registry before each hook call. */
57
+ cli?: CLIWriter;
58
+ /** Legacy logger alias — kept for backwards compatibility. */
59
+ logger?: CLIWriter;
60
+ metadata?: Record<string, any>;
61
+ /** MCP discovery registry available during MCP startup discovery. */
62
+ mcpDiscovery?: PluginMcpDiscovery;
+ /** Open-ended: the core may attach additional, untyped properties. */
63
+ [key: string]: any;
64
+ }
65
+ /** An MCP tool contributed by a plugin (via `CrawlithPlugin.mcp.tools` or `PluginMcpDiscovery.registerTool`). */
+ export interface PluginMcpTool {
66
+ name: string;
67
+ description: string;
68
+ /** Zod-like shape object interpreted by the MCP server package. */
69
+ inputSchema?: Record<string, unknown>;
+ /** Tool implementation; may return its result directly or as a promise. */
70
+ execute: (input: Record<string, unknown>, ctx: PluginContext) => unknown | Promise<unknown>;
71
+ }
72
+ /** An MCP prompt contributed by a plugin (via `CrawlithPlugin.mcp.prompts` or `PluginMcpDiscovery.registerPrompt`). */
+ export interface PluginMcpPrompt {
73
+ name: string;
74
+ description: string;
75
+ /** Zod-like shape object interpreted by the MCP server package. */
76
+ argumentsSchema?: Record<string, unknown>;
+ /** Synchronously builds the prompt's message list; every message carries text-only content. */
77
+ buildMessages: (input: Record<string, unknown>, ctx: PluginContext) => {
78
+ messages: Array<{
79
+ role: 'user' | 'assistant' | 'system';
80
+ content: {
81
+ type: 'text';
82
+ text: string;
83
+ };
84
+ }>;
85
+ };
86
+ }
87
+ /**
+ * Registration surface exposed to plugins as `ctx.mcpDiscovery`,
+ * used to contribute MCP tools and prompts.
+ */
+ export interface PluginMcpDiscovery {
88
+ registerTool(tool: PluginMcpTool): void;
89
+ registerPrompt(prompt: PluginMcpPrompt): void;
90
+ }
91
+ /**
+ * One additional CLI option declared under `CrawlithPlugin.cli.options`.
+ * The registry passes these fields to Commander's `cmd.option(flag, description[, defaultValue])`.
+ */
+ export interface PluginCliOption {
92
+ /** e.g. '--my-flag <value>' */
93
+ flag: string;
94
+ description: string;
+ /** Forwarded to Commander only when it is neither `undefined` nor `null`. */
95
+ defaultValue?: unknown;
96
+ }
97
+ /** Column types a plugin may declare for its per-page storage columns (see `PluginStorage.perPage.columns`). */
+ export type PluginColumnType = 'INTEGER' | 'REAL' | 'TEXT';
98
+ /**
+ * Declarative storage settings for a plugin.
+ * `PluginRegistry.applyStorage` defines `perPage.columns` on the plugin's scoped db handle.
+ */
+ export interface PluginStorage {
99
+ /** Whether this plugin fetches data from a network or computes locally. Defaults to 'network'. */
100
+ fetchMode?: 'local' | 'network';
101
+ /**
102
+ * Per-URL columns to add to the plugin's scoped data table.
103
+ * The core creates the table automatically before `onInit` runs,
104
+ * so plugins never need to call `ctx.db.schema.define()`.
+ * Keys are column names; values are the column's `PluginColumnType`.
105
+ */
106
+ perPage?: {
107
+ columns: Record<string, PluginColumnType>;
108
+ };
109
+ }
110
+ /** Shape of a Crawlith plugin as consumed by `PluginLoader` and `PluginRegistry`. */
+ export interface CrawlithPlugin {
+ /**
+ * Unique plugin name. The loader and the registry both reject a second plugin
+ * with the same name, and the name is also the plugin's config section and
+ * the key used to scope its db handle.
+ */
111
+ name: string;
+ /** Filled from the package's `version` (or '0.0.0') by the loader when omitted. */
112
+ version?: string;
+ /** Filled from the package's `description` (or '') by the loader when omitted. */
113
+ description?: string;
114
+ /**
115
+ * Declarative CLI registration.
116
+ * The core registers these options on the appropriate commands —
117
+ * no need to interact with Commander directly.
118
+ *
119
+ * `for` controls which commands expose the options:
120
+ * - `['page', 'crawl']` (default when omitted) — both commands
121
+ * - `['page']` — page command only
122
+ * - `['crawl']` — crawl command only
123
+ */
124
+ cli?: {
125
+ flag: string;
126
+ description: string;
127
+ defaultValue?: unknown;
128
+ for?: ('page' | 'crawl')[];
129
+ options?: PluginCliOption[];
130
+ };
131
+ /**
132
+ * Declarative storage schema.
133
+ * The core creates the plugin's scoped table before `onInit` runs.
134
+ * Plugins that don't persist data can omit this entirely.
135
+ */
136
+ storage?: PluginStorage;
137
+ /**
138
+ * Set to true to declare this plugin as a Score Provider.
139
+ * The core will automatically aggregate the `score` and `weight` columns
140
+ * from this plugin's storage table during snapshot aggregation.
141
+ */
142
+ scoreProvider?: boolean;
143
+ /**
144
+ * Legacy imperative CLI registration — kept for backwards compatibility.
145
+ * Prefer `cli` for new plugins.
+ * The registry calls this once for every command it visits (the root command
+ * and each sub-command), not only for `page` / `crawl`.
146
+ * @deprecated Use `cli` instead.
147
+ */
148
+ register?: (cli: any) => void;
149
+ /** Declarative MCP definitions discovered by @crawlith/mcp at startup. */
150
+ mcp?: {
151
+ tools?: PluginMcpTool[];
152
+ prompts?: PluginMcpPrompt[];
153
+ };
+ /**
+ * Lifecycle hooks. Every hook is optional; a hook that throws is logged by
+ * the registry and does not prevent other plugins' hooks from running.
+ */
154
+ hooks?: {
155
+ /**
156
+ * Runs on both `page` and `crawl` scopes.
157
+ * Use for any setup that doesn't depend on the scope —
158
+ * e.g. initialising in-memory state, reading config.
159
+ * DB schema is already created by the time this runs (via `storage`).
160
+ */
161
+ onInit?: (ctx: PluginContext) => void | Promise<void>;
162
+ /**
163
+ * Single-page hook — `page` scope only.
164
+ * Receives the target URL, its raw HTML, and HTTP status.
165
+ * Use this for URL-scoped plugins (PageSpeed, heading-health, etc.).
166
+ */
167
+ onPage?: (ctx: PluginContext, page: PageInput) => void | Promise<void>;
168
+ /**
169
+ * Fired at the very start of a crawl — `crawl` scope only.
170
+ * Use to initialise crawl-wide state or validate config.
171
+ */
172
+ onCrawlStart?: (ctx: PluginContext) => void | Promise<void>;
173
+ /**
174
+ * URL enqueue filter — `crawl` scope only.
175
+ * Return `false` to prevent a URL from being crawled.
+ * Synchronous: the declared return type is `boolean`, not a promise.
176
+ */
177
+ shouldEnqueueUrl?: (ctx: PluginContext, url: string, depth: number) => boolean;
178
+ /**
179
+ * Fired after each page is fetched and parsed — `crawl` scope only.
180
+ * Use for real-time per-page processing without waiting for the full graph.
181
+ */
182
+ onPageParsed?: (ctx: PluginContext, page: any) => void | Promise<void>;
183
+ /**
184
+ * Fired after the full link graph is built — `crawl` scope only.
185
+ * Graph structure is complete; metrics have not been computed yet.
186
+ */
187
+ onGraphBuilt?: (ctx: PluginContext, graph: any) => void | Promise<void>;
188
+ /**
189
+ * Graph-level metrics phase — `crawl` scope only.
190
+ * All pages are available; use for cross-page analysis (duplicate
191
+ * detection, PageRank, heading structure across the site, etc.).
192
+ */
193
+ onMetrics?: (ctx: PluginContext, graph: any) => void | Promise<void>;
194
+ /**
195
+ * Final report phase — `crawl` scope only.
196
+ * Attach snapshot-level summary data to the result object.
197
+ */
198
+ onReport?: (ctx: PluginContext, report: any) => void | Promise<void>;
199
+ /**
200
+ * MCP discovery phase — executed by @crawlith/mcp server startup.
201
+ * Plugins can register MCP tools/prompts through `ctx.mcpDiscovery`.
202
+ */
203
+ onMcpDiscovery?: (ctx: PluginContext) => void | Promise<void>;
204
+ };
205
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,9 @@
1
+ // Port types for the persistence and fetch layers. Each is currently an
+ // unconstrained `Record<string, unknown>`, i.e. no members are specified yet.
+ export type PageRepositoryPort = Record<string, unknown>;
2
+ export type EdgeRepositoryPort = Record<string, unknown>;
3
+ export type SnapshotRepositoryPort = Record<string, unknown>;
4
+ export type FetcherPort = Record<string, unknown>;
5
+ /** Logging port with three required levels; note there is no `debug` level here. */
+ export interface LoggerPort {
6
+ info(msg: string): void;
7
+ warn(msg: string): void;
8
+ error(msg: string): void;
9
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,3 @@
1
+ /**
+ * Turns the raw export option (a string, a boolean, or undefined) into a list of format names.
+ * NOTE(review): the parsing rules live in export.js, which is not shown here — confirm before relying on them.
+ */
+ export declare function parseExportFormats(exportOption: string | boolean | undefined): string[];
2
+ /**
+ * Runs the crawl exports for the requested `formats`; resolves with no value.
+ * NOTE(review): presumably writes files under `outputDir` — confirm against export.js.
+ */
+ export declare function runCrawlExports(formats: string[], outputDir: string, url: string, graphData: any, metrics: any, graphObj: any, report?: any): Promise<void>;
3
+ /**
+ * Runs the single-analysis exports for the requested `formats`; resolves with no value.
+ * NOTE(review): presumably writes files under `outputDir`; the effect of `isLive` is not visible here.
+ */
+ export declare function runAnalysisExports(formats: string[], outputDir: string, result: any, isLive: boolean): Promise<void>;