@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -0,0 +1,122 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import { pathToFileURL } from 'node:url';
4
+ export class PluginLoader {
5
+ plugins = new Map();
6
+ logger;
7
+ constructor(logger) {
8
+ this.logger = logger;
9
+ }
10
+ async discover(rootPath) {
11
+ // 1. Discover Internal Plugins
12
+ const internalPath = path.resolve(rootPath, 'packages/plugins');
13
+ if (fs.existsSync(internalPath)) {
14
+ this.logger?.debug(`[plugin] Scanning internal directory: ${internalPath}`);
15
+ await this.loadFromDir(internalPath, 'internal');
16
+ }
17
+ // 2. Discover External Plugins
18
+ const nodeModulesPath = path.resolve(rootPath, 'node_modules');
19
+ if (fs.existsSync(nodeModulesPath)) {
20
+ this.logger?.debug(`[plugin] Scanning node_modules: ${nodeModulesPath}`);
21
+ await this.loadFromNodeModules(nodeModulesPath);
22
+ }
23
+ return Array.from(this.plugins.values());
24
+ }
25
+ async loadFromDir(dirPath, type) {
26
+ const entries = fs.readdirSync(dirPath, { withFileTypes: true });
27
+ for (const entry of entries) {
28
+ if (entry.isDirectory()) {
29
+ const fullPath = path.join(dirPath, entry.name);
30
+ await this.tryLoadPlugin(fullPath, type);
31
+ }
32
+ }
33
+ }
34
+ async loadFromNodeModules(nodeModulesPath) {
35
+ if (!fs.existsSync(nodeModulesPath))
36
+ return;
37
+ const entries = fs.readdirSync(nodeModulesPath, { withFileTypes: true });
38
+ for (const entry of entries) {
39
+ if (!entry.isDirectory())
40
+ continue;
41
+ if (entry.name.startsWith('@')) {
42
+ // Scoped packages
43
+ const scopePath = path.join(nodeModulesPath, entry.name);
44
+ const scopedEntries = fs.readdirSync(scopePath, { withFileTypes: true });
45
+ for (const scopedEntry of scopedEntries) {
46
+ if (scopedEntry.isDirectory()) {
47
+ await this.tryLoadPlugin(path.join(scopePath, scopedEntry.name), 'external');
48
+ }
49
+ }
50
+ }
51
+ else {
52
+ await this.tryLoadPlugin(path.join(nodeModulesPath, entry.name), 'external');
53
+ }
54
+ }
55
+ }
56
+ async tryLoadPlugin(pluginPath, type) {
57
+ const pkgPath = path.join(pluginPath, 'package.json');
58
+ if (!fs.existsSync(pkgPath))
59
+ return;
60
+ try {
61
+ const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8'));
62
+ const isCrawlithPlugin = pkg.name?.startsWith('@crawlith/plugin-') ||
63
+ pkg.name?.startsWith('crawlith-plugin-') ||
64
+ pkg.crawlith?.type === 'plugin' ||
65
+ type === 'internal';
66
+ if (!isCrawlithPlugin)
67
+ return;
68
+ let entryPoint = pkg.exports ?? pkg.main ?? 'index.js';
69
+ if (typeof entryPoint === 'object' && entryPoint !== null) {
70
+ entryPoint = entryPoint['.'] ?? entryPoint.import ?? entryPoint.default ?? 'index.js';
71
+ }
72
+ if (typeof entryPoint !== 'string') {
73
+ entryPoint = 'index.js';
74
+ }
75
+ let fullEntryPoint = path.join(pluginPath, entryPoint);
76
+ // If we're loading a .ts file as the intended entry point, but a .js one exists in dist,
77
+ // prefer the .js one to avoid requiring a TS loader at runtime.
78
+ if (fullEntryPoint.endsWith('.ts')) {
79
+ const distJsPath = path.join(pluginPath, 'dist', 'index.js');
80
+ if (fs.existsSync(distJsPath)) {
81
+ fullEntryPoint = distJsPath;
82
+ }
83
+ }
84
+ if (!fs.existsSync(fullEntryPoint) && fs.existsSync(path.join(pluginPath, 'index.ts'))) {
85
+ fullEntryPoint = path.join(pluginPath, 'index.ts');
86
+ }
87
+ if (!fs.existsSync(fullEntryPoint)) {
88
+ this.logger?.debug(`[plugin] Skipped: ${pkg.name} (entry point not found: ${fullEntryPoint})`);
89
+ return;
90
+ }
91
+ const imported = await import(pathToFileURL(fullEntryPoint).href);
92
+ const plugin = imported.default || imported;
93
+ if (this.validatePlugin(plugin, pkg)) {
94
+ if (this.plugins.has(plugin.name)) {
95
+ throw new Error(`Duplicate plugin name: ${plugin.name}`);
96
+ }
97
+ this.plugins.set(plugin.name, plugin);
98
+ this.logger?.debug(`[plugin] Loaded: ${plugin.name} v${plugin.version} (${type})`);
99
+ }
100
+ }
101
+ catch (err) {
102
+ this.logger?.debug(`[plugin] Failed to load plugin at ${pluginPath}: ${err.message}`);
103
+ }
104
+ }
105
+ validatePlugin(plugin, pkg) {
106
+ if (!plugin || typeof plugin !== 'object') {
107
+ this.logger?.debug(`[plugin] Skipped: ${pkg.name} (invalid export)`);
108
+ return false;
109
+ }
110
+ if (!plugin.name) {
111
+ this.logger?.debug(`[plugin] Skipped: ${pkg.name} (missing name)`);
112
+ return false;
113
+ }
114
+ if (!plugin.version) {
115
+ plugin.version = pkg.version || '0.0.0';
116
+ }
117
+ if (!plugin.description) {
118
+ plugin.description = pkg.description || '';
119
+ }
120
+ return true;
121
+ }
122
+ }
@@ -0,0 +1,25 @@
1
+ import { CrawlithPlugin, PluginContext } from './plugin-types.js';
2
/** Holds the active plugins and dispatches CLI registration, storage setup and lifecycle hooks to them. */
+ export declare class PluginRegistry {
3
+ private plugins;
4
/** @param plugins Initial plugin list; the array is stored as given, not copied. */
+ constructor(plugins?: CrawlithPlugin[]);
5
/** Same array that `getPlugins()` returns. */
+ get pluginsList(): CrawlithPlugin[];
6
+ private registeredCommands;
7
+ /**
8
+ * Registers all plugin CLI flags on the given command.
9
+ * Handles both declarative `cli` config and legacy `register(cmd)` callbacks.
10
+ */
11
+ registerPlugins(program: any): void;
12
+ /**
13
+ * Applies declarative `storage` schemas for all plugins.
14
+ * Called by the core after `getCrawlithDB()` is available, before any hooks run.
15
+ */
16
+ applyStorage(context: PluginContext): void;
17
+ /** Hooks that only make sense during a full site crawl. */
18
+ private static readonly CRAWL_ONLY_HOOKS;
19
+ /** Hooks that only make sense during a single-page analysis. */
20
+ private static readonly PAGE_ONLY_HOOKS;
21
/**
 * Awaits `hookName` on every plugin that implements it, in registration order.
 * Does nothing when the hook does not belong to `context.scope`.
 * An error thrown by one plugin is logged through `context.logger` and does not stop the others.
 */
+ runHook(hookName: string, context: PluginContext, payload?: any): Promise<void>;
22
/**
 * Calls `hookName` synchronously on each plugin in registration order and
 * returns the first result that is not `undefined`; returns `undefined` when
 * no plugin produces one. A throwing plugin is logged and the next one is tried.
 */
+ runSyncBailHook(hookName: string, context: PluginContext, ...args: any[]): any;
23
/** Returns the registry's own plugin array (not a copy). */
+ getPlugins(): CrawlithPlugin[];
24
/** Appends a plugin. Throws `Error` when a plugin with the same `name` is already registered. */
+ addPlugin(plugin: CrawlithPlugin): void;
25
+ }
@@ -0,0 +1,167 @@
1
+ import { Command } from 'commander';
2
+ import { PluginConfig } from './plugin-config.js';
3
+ export class PluginRegistry {
4
+ plugins = [];
5
+ constructor(plugins = []) {
6
+ this.plugins = plugins;
7
+ }
8
+ get pluginsList() {
9
+ return this.plugins;
10
+ }
11
+ registeredCommands = new WeakSet();
12
+ /**
13
+ * Registers all plugin CLI flags on the given command.
14
+ * Handles both declarative `cli` config and legacy `register(cmd)` callbacks.
15
+ */
16
+ registerPlugins(program) {
17
+ if (!(program instanceof Command))
18
+ return;
19
+ const traverse = (cmd) => {
20
+ if (this.registeredCommands.has(cmd))
21
+ return;
22
+ this.registeredCommands.add(cmd);
23
+ const cmdName = cmd.name();
24
+ for (const plugin of this.plugins) {
25
+ // Declarative cli registration (preferred)
26
+ if (plugin.cli) {
27
+ const targets = plugin.cli.for ?? ['page', 'crawl'];
28
+ if (targets.includes(cmdName)) {
29
+ const defaultValue = plugin.cli.defaultValue;
30
+ if (defaultValue !== undefined && defaultValue !== null) {
31
+ cmd.option(plugin.cli.flag, plugin.cli.description, defaultValue);
32
+ }
33
+ else {
34
+ cmd.option(plugin.cli.flag, plugin.cli.description);
35
+ }
36
+ for (const opt of plugin.cli.options ?? []) {
37
+ const dv = opt.defaultValue;
38
+ if (dv !== undefined && dv !== null) {
39
+ cmd.option(opt.flag, opt.description, dv);
40
+ }
41
+ else {
42
+ cmd.option(opt.flag, opt.description);
43
+ }
44
+ }
45
+ }
46
+ }
47
+ // Legacy imperative registration (backwards compat)
48
+ if (typeof plugin.register === 'function') {
49
+ plugin.register(cmd);
50
+ }
51
+ }
52
+ for (const sub of cmd.commands) {
53
+ traverse(sub);
54
+ }
55
+ };
56
+ traverse(program);
57
+ }
58
+ /**
59
+ * Applies declarative `storage` schemas for all plugins.
60
+ * Called by the core after `getCrawlithDB()` is available, before any hooks run.
61
+ */
62
+ applyStorage(context) {
63
+ for (const plugin of this.plugins) {
64
+ if (!plugin.storage?.perPage?.columns)
65
+ continue;
66
+ if (!context.db)
67
+ continue;
68
+ try {
69
+ const scopedDb = context.db.scope(plugin.name, context.snapshotId, {
70
+ live: context.live,
71
+ fetchMode: plugin.storage.fetchMode
72
+ });
73
+ scopedDb.schema.define(plugin.storage.perPage.columns);
74
+ }
75
+ catch (err) {
76
+ context.logger?.error(`[plugin:${plugin.name}] Storage schema failed: ${err.message}`);
77
+ }
78
+ }
79
+ }
80
+ // ─── Scope guards ─────────────────────────────────────────────────────────
81
+ /** Hooks that only make sense during a full site crawl. */
82
+ static CRAWL_ONLY_HOOKS = new Set([
83
+ 'onCrawlStart', 'onPageParsed', 'onGraphBuilt', 'onMetrics', 'onReport', 'shouldEnqueueUrl'
84
+ ]);
85
+ /** Hooks that only make sense during a single-page analysis. */
86
+ static PAGE_ONLY_HOOKS = new Set([
87
+ 'onPage'
88
+ ]);
89
+ // ─── Hook runners ─────────────────────────────────────────────────────────
90
+ async runHook(hookName, context, payload) {
91
+ // Enforce scope: silently skip hooks that don't belong to this execution context.
92
+ // Undefined scope (legacy/test) is treated as permissive.
93
+ if (context.scope === 'page' && PluginRegistry.CRAWL_ONLY_HOOKS.has(hookName))
94
+ return;
95
+ if (context.scope === 'crawl' && PluginRegistry.PAGE_ONLY_HOOKS.has(hookName))
96
+ return;
97
+ for (const plugin of this.plugins) {
98
+ const hooks = plugin.hooks;
99
+ if (hooks && typeof hooks[hookName] === 'function') {
100
+ const scopedDb = context.db?.scope(plugin.name, context.snapshotId || payload?.snapshotId, {
101
+ live: context.live,
102
+ fetchMode: plugin.storage?.fetchMode
103
+ });
104
+ const scopedConfig = new PluginConfig(plugin.name);
105
+ // Resolve targetUrl from payload if available (standard result object)
106
+ const targetUrl = context.targetUrl || payload?.pages?.[0]?.url;
107
+ // Build a CLIWriter that prefixes plugin name — satisfies both cli and logger
108
+ const cliWriter = {
109
+ info: (m) => context.logger?.info(m),
110
+ warn: (m) => context.logger?.warn(m),
111
+ error: (m) => context.logger?.error(m),
112
+ debug: (m) => context.logger?.debug(m),
113
+ };
114
+ const scopedContext = {
115
+ ...context,
116
+ db: scopedDb,
117
+ config: scopedConfig,
118
+ targetUrl,
119
+ cli: cliWriter,
120
+ logger: cliWriter, // keep logger alias in sync
121
+ };
122
+ try {
123
+ if (payload !== undefined) {
124
+ await hooks[hookName](scopedContext, payload);
125
+ }
126
+ else {
127
+ await hooks[hookName](scopedContext);
128
+ }
129
+ }
130
+ catch (err) {
131
+ context.logger?.error(`[plugin:${plugin.name}] Hook ${hookName} failed: ${err.message}`);
132
+ }
133
+ }
134
+ }
135
+ }
136
+ runSyncBailHook(hookName, context, ...args) {
137
+ for (const plugin of this.plugins) {
138
+ const hooks = plugin.hooks;
139
+ if (hooks && typeof hooks[hookName] === 'function') {
140
+ const scopedDb = context.db?.scope(plugin.name, context.snapshotId, {
141
+ live: context.live,
142
+ fetchMode: plugin.storage?.fetchMode
143
+ });
144
+ const scopedConfig = new PluginConfig(plugin.name);
145
+ const scopedContext = { ...context, db: scopedDb, config: scopedConfig };
146
+ try {
147
+ const result = hooks[hookName](scopedContext, ...args);
148
+ if (result !== undefined)
149
+ return result;
150
+ }
151
+ catch (err) {
152
+ context.logger?.error(`[plugin:${plugin.name}] Sync bail hook ${hookName} failed: ${err.message}`);
153
+ }
154
+ }
155
+ }
156
+ return undefined;
157
+ }
158
+ getPlugins() {
159
+ return this.plugins;
160
+ }
161
+ addPlugin(plugin) {
162
+ if (this.plugins.some(p => p.name === plugin.name)) {
163
+ throw new Error(`Duplicate plugin name: ${plugin.name}`);
164
+ }
165
+ this.plugins.push(plugin);
166
+ }
167
+ }
@@ -0,0 +1,205 @@
1
+ import type { CrawlithDB } from '../db/CrawlithDB.js';
2
+ /**
3
+ * Execution scope — set by the core before any hooks run.
4
+ * - `page` → single-URL analysis (crawlith page …)
5
+ * - `crawl` → full site crawl (crawlith crawl …)
6
+ */
7
+ export type PluginScope = 'page' | 'crawl';
8
+ /** Injected into hook contexts to let plugins emit structured CLI output. */
9
+ export interface CLIWriter {
10
+ info(message: string): void;
11
+ warn(message: string): void;
12
+ error(message: string): void;
13
+ debug(message: string): void;
14
+ }
15
+ /**
16
+ * Injected into report-phase hook contexts.
17
+ * Plugins contribute structured data that the core aggregates into the final output.
18
+ */
19
+ export interface ReportWriter {
20
+ /** Attach a named data section to the report output. */
21
+ addSection(pluginName: string, data: unknown): void;
22
+ /** Optionally contribute a weighted score component. */
23
+ contributeScore?(input: {
24
+ label: string;
25
+ score: number;
26
+ weight: number;
27
+ }): void;
28
+ }
29
+ /** Passed to `onPage` — the single URL being analyzed. */
30
+ export interface PageInput {
31
+ /** Absolute URL of the page being analyzed. */
32
+ url: string;
33
+ /** Raw HTML content of the page. */
34
+ html: string;
35
+ /** HTTP status code. */
36
+ status: number;
37
+ /** HTTP Response headers. */
38
+ headers?: Record<string, string | string[]>;
39
+ }
40
+ /** Base context available in every hook. */
41
+ export interface PluginContext {
42
+ /** Execution scope. Undefined only in legacy / test contexts — treated as permissive. */
43
+ scope?: PluginScope;
44
/** NOTE(review): presumably the name of the CLI command being executed — not read by the registry; confirm against the core. */
+ command?: string;
45
+ /** Whether live fallback is allowed (from --live flag). Core-controlled. */
46
+ live?: boolean;
47
/** NOTE(review): presumably the parsed CLI options for the current command — confirm against the core. */
+ flags?: Record<string, any>;
48
/** Snapshot the plugin's scoped DB handle is bound to. `runHook` falls back to `payload.snapshotId` when this is unset. */
+ snapshotId?: number;
49
/** URL under analysis. `runHook` falls back to `payload.pages[0].url` when this is unset. */
+ targetUrl?: string;
50
/** Database handle. Before invoking a hook the registry replaces it with `db.scope(plugin.name, snapshotId, …)`. */
+ db?: CrawlithDB;
51
/** Config accessor. For each hook call the registry supplies a `PluginConfig` built from the plugin's name. */
+ config?: {
52
+ get(key?: string): string;
53
+ require(key?: string): string;
54
+ set(value: string): void;
55
+ };
56
+ /** CLI writer — populated by the registry before each hook call. */
57
+ cli?: CLIWriter;
58
+ /** Legacy logger alias — kept for backwards compatibility. */
59
+ logger?: CLIWriter;
60
/** NOTE(review): free-form bag; its contents are not defined in this file — confirm intended use. */
+ metadata?: Record<string, any>;
61
+ /** MCP discovery registry available during MCP startup discovery. */
62
+ mcpDiscovery?: PluginMcpDiscovery;
63
/** Open index signature: any additional property is accepted and typed `any`. */
+ [key: string]: any;
64
+ }
65
+ export interface PluginMcpTool {
66
+ name: string;
67
+ description: string;
68
+ /** Zod-like shape object interpreted by the MCP server package. */
69
+ inputSchema?: Record<string, unknown>;
70
+ execute: (input: Record<string, unknown>, ctx: PluginContext) => unknown | Promise<unknown>;
71
+ }
72
+ export interface PluginMcpPrompt {
73
+ name: string;
74
+ description: string;
75
+ /** Zod-like shape object interpreted by the MCP server package. */
76
+ argumentsSchema?: Record<string, unknown>;
77
+ buildMessages: (input: Record<string, unknown>, ctx: PluginContext) => {
78
+ messages: Array<{
79
+ role: 'user' | 'assistant' | 'system';
80
+ content: {
81
+ type: 'text';
82
+ text: string;
83
+ };
84
+ }>;
85
+ };
86
+ }
87
+ export interface PluginMcpDiscovery {
88
+ registerTool(tool: PluginMcpTool): void;
89
+ registerPrompt(prompt: PluginMcpPrompt): void;
90
+ }
91
+ export interface PluginCliOption {
92
+ /** e.g. '--my-flag <value>' */
93
+ flag: string;
94
+ description: string;
95
+ defaultValue?: unknown;
96
+ }
97
+ export type PluginColumnType = 'INTEGER' | 'REAL' | 'TEXT';
98
+ export interface PluginStorage {
99
+ /** Whether this plugin fetches data from a network or computes locally. Defaults to 'network'. */
100
+ fetchMode?: 'local' | 'network';
101
+ /**
102
+ * Per-URL columns to add to the plugin's scoped data table.
103
+ * The core creates the table automatically before `onInit` runs,
104
+ * so plugins never need to call `ctx.db.schema.define()`.
105
+ */
106
+ perPage?: {
107
+ columns: Record<string, PluginColumnType>;
108
+ };
109
+ }
110
/**
 * Shape of a Crawlith plugin as consumed by the plugin registry.
 * Only `name` is required; everything else is opt-in.
 */
export interface CrawlithPlugin {
    /** Unique plugin name; the registry rejects duplicates. */
    name: string;
    version?: string;
    description?: string;
    /**
     * Declarative CLI registration.
     * The core registers these options on the appropriate commands —
     * no need to interact with Commander directly.
     *
     * `for` controls which commands expose the options:
     * - `['page', 'crawl']` (default when omitted) — both commands
     * - `['page']` — page command only
     * - `['crawl']` — crawl command only
     */
    cli?: {
        flag: string;
        description: string;
        defaultValue?: unknown;
        for?: ('page' | 'crawl')[];
        options?: PluginCliOption[];
    };
    /**
     * Declarative storage schema.
     * The core creates the plugin's scoped table before `onInit` runs.
     * Plugins that don't persist data can omit this entirely.
     */
    storage?: PluginStorage;
    /**
     * Set to true to declare this plugin as a Score Provider.
     * The core will automatically aggregate the `score` and `weight` columns
     * from this plugin's storage table during snapshot aggregation.
     */
    scoreProvider?: boolean;
    /**
     * Legacy imperative CLI registration — kept for backwards compatibility.
     * Prefer `cli` for new plugins.
     * @deprecated Use `cli` instead.
     */
    register?: (cli: any) => void;
    /** Declarative MCP definitions discovered by @crawlith/mcp at startup. */
    mcp?: {
        tools?: PluginMcpTool[];
        prompts?: PluginMcpPrompt[];
    };
    hooks?: {
        /**
         * Runs on both `page` and `crawl` scopes.
         * Use for any setup that doesn't depend on the scope —
         * e.g. initialising in-memory state, reading config.
         * DB schema is already created by the time this runs (via `storage`).
         */
        onInit?: (ctx: PluginContext) => void | Promise<void>;
        /**
         * Single-page hook — `page` scope only.
         * Receives the target URL, its raw HTML, and HTTP status.
         * Use this for URL-scoped plugins (PageSpeed, heading-health, etc.).
         */
        onPage?: (ctx: PluginContext, page: PageInput) => void | Promise<void>;
        /**
         * Fired at the very start of a crawl — `crawl` scope only.
         * Use to initialise crawl-wide state or validate config.
         */
        onCrawlStart?: (ctx: PluginContext) => void | Promise<void>;
        /**
         * URL enqueue filter — `crawl` scope only.
         * Return `false` to prevent a URL from being crawled.
         *
         * This is a synchronous "bail" hook: plugins are asked in registration
         * order and the first one that returns a defined value decides.
         * Return nothing (`undefined`) to express no opinion and let the next
         * plugin decide; note that returning `true` also ends the chain.
         */
        shouldEnqueueUrl?: (ctx: PluginContext, url: string, depth: number) => boolean | void;
        /**
         * Fired after each page is fetched and parsed — `crawl` scope only.
         * Use for real-time per-page processing without waiting for the full graph.
         */
        onPageParsed?: (ctx: PluginContext, page: any) => void | Promise<void>;
        /**
         * Fired after the full link graph is built — `crawl` scope only.
         * Graph structure is complete; metrics have not been computed yet.
         */
        onGraphBuilt?: (ctx: PluginContext, graph: any) => void | Promise<void>;
        /**
         * Graph-level metrics phase — `crawl` scope only.
         * All pages are available; use for cross-page analysis (duplicate
         * detection, PageRank, heading structure across the site, etc.).
         */
        onMetrics?: (ctx: PluginContext, graph: any) => void | Promise<void>;
        /**
         * Final report phase — `crawl` scope only.
         * Attach snapshot-level summary data to the result object.
         */
        onReport?: (ctx: PluginContext, report: any) => void | Promise<void>;
        /**
         * MCP discovery phase — executed by @crawlith/mcp server startup.
         * Plugins can register MCP tools/prompts through `ctx.mcpDiscovery`.
         */
        onMcpDiscovery?: (ctx: PluginContext) => void | Promise<void>;
    };
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,9 @@
1
/*
 * The four aliases below are all plain `Record<string, unknown>`: they give
 * each dependency a name but do not constrain its shape.
 */
+ export type PageRepositoryPort = Record<string, unknown>;
2
+ export type EdgeRepositoryPort = Record<string, unknown>;
3
+ export type SnapshotRepositoryPort = Record<string, unknown>;
4
+ export type FetcherPort = Record<string, unknown>;
5
/** Logging contract with three levels (`info`, `warn`, `error`), each taking one message string. */
+ export interface LoggerPort {
6
+ info(msg: string): void;
7
+ warn(msg: string): void;
8
+ error(msg: string): void;
9
+ }
@@ -0,0 +1 @@
1
+ export {};