@pruddiman/hem 0.0.1-beta-5671db0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/LICENSE +21 -0
  2. package/dist/agents/arbiter-agent.d.ts +72 -0
  3. package/dist/agents/arbiter-agent.js +149 -0
  4. package/dist/agents/architecture-agent.d.ts +148 -0
  5. package/dist/agents/architecture-agent.js +459 -0
  6. package/dist/agents/base-agent.d.ts +44 -0
  7. package/dist/agents/base-agent.js +57 -0
  8. package/dist/agents/crossref-agent.d.ts +140 -0
  9. package/dist/agents/crossref-agent.js +560 -0
  10. package/dist/agents/crossref-arbiter-agent.d.ts +72 -0
  11. package/dist/agents/crossref-arbiter-agent.js +147 -0
  12. package/dist/agents/documentation-agent.d.ts +55 -0
  13. package/dist/agents/documentation-agent.js +159 -0
  14. package/dist/agents/exploration-agent.d.ts +58 -0
  15. package/dist/agents/exploration-agent.js +102 -0
  16. package/dist/agents/grouping-agent.d.ts +167 -0
  17. package/dist/agents/grouping-agent.js +557 -0
  18. package/dist/agents/index-agent.d.ts +86 -0
  19. package/dist/agents/index-agent.js +360 -0
  20. package/dist/agents/organization-agent.d.ts +144 -0
  21. package/dist/agents/organization-agent.js +607 -0
  22. package/dist/auth.d.ts +372 -0
  23. package/dist/auth.js +1072 -0
  24. package/dist/broadcast-mcp.d.ts +21 -0
  25. package/dist/broadcast-mcp.js +59 -0
  26. package/dist/changelog.d.ts +85 -0
  27. package/dist/changelog.js +223 -0
  28. package/dist/decision-queue.d.ts +173 -0
  29. package/dist/decision-queue.js +265 -0
  30. package/dist/diff-scope.d.ts +24 -0
  31. package/dist/diff-scope.js +28 -0
  32. package/dist/discovery.d.ts +54 -0
  33. package/dist/discovery.js +405 -0
  34. package/dist/grouping.d.ts +37 -0
  35. package/dist/grouping.js +343 -0
  36. package/dist/helpers/format.d.ts +5 -0
  37. package/dist/helpers/format.js +13 -0
  38. package/dist/helpers/index.d.ts +11 -0
  39. package/dist/helpers/index.js +11 -0
  40. package/dist/helpers/parsing.d.ts +52 -0
  41. package/dist/helpers/parsing.js +128 -0
  42. package/dist/helpers/paths.d.ts +41 -0
  43. package/dist/helpers/paths.js +67 -0
  44. package/dist/helpers/strings.d.ts +45 -0
  45. package/dist/helpers/strings.js +97 -0
  46. package/dist/index.d.ts +135 -0
  47. package/dist/index.js +1087 -0
  48. package/dist/merge-utils.d.ts +22 -0
  49. package/dist/merge-utils.js +34 -0
  50. package/dist/orchestrator.d.ts +194 -0
  51. package/dist/orchestrator.js +1169 -0
  52. package/dist/output.d.ts +106 -0
  53. package/dist/output.js +243 -0
  54. package/dist/progress.d.ts +228 -0
  55. package/dist/progress.js +644 -0
  56. package/dist/providers/copilot.d.ts +247 -0
  57. package/dist/providers/copilot.js +598 -0
  58. package/dist/providers/index.d.ts +15 -0
  59. package/dist/providers/index.js +12 -0
  60. package/dist/providers/opencode.d.ts +156 -0
  61. package/dist/providers/opencode.js +416 -0
  62. package/dist/providers/types.d.ts +156 -0
  63. package/dist/providers/types.js +16 -0
  64. package/dist/resources.d.ts +76 -0
  65. package/dist/resources.js +151 -0
  66. package/dist/search-index.d.ts +71 -0
  67. package/dist/search-index.js +187 -0
  68. package/dist/search-mcp.d.ts +25 -0
  69. package/dist/search-mcp.js +100 -0
  70. package/dist/server-utils.d.ts +56 -0
  71. package/dist/server-utils.js +135 -0
  72. package/dist/session.d.ts +227 -0
  73. package/dist/session.js +370 -0
  74. package/dist/types.d.ts +272 -0
  75. package/dist/types.js +5 -0
  76. package/dist/worktree.d.ts +82 -0
  77. package/dist/worktree.js +187 -0
  78. package/package.json +45 -0
@@ -0,0 +1,156 @@
1
+ /**
2
+ * Provider abstraction layer type definitions for Hem.
3
+ *
4
+ * Defines the {@link Provider} interface that all LLM provider
5
+ * implementations must satisfy, the {@link ProviderConfig} configuration
6
+ * type, and the {@link SseEvent} type used by the SSE relay.
7
+ *
8
+ * Aligns with Dispatch's `ProviderInstance` pattern:
9
+ * - `createSession()` — create an isolated session, returns session ID
10
+ * - `prompt(sessionId, text)` — send a prompt and wait for completion
11
+ * - `cleanup()` — tear down the provider
12
+ *
13
+ * Also exposes low-level `session` and `event` properties for agents that
14
+ * need direct session control (OrganizationAgent, CrossRefAgent, etc.).
15
+ */
16
+ import type { SseEvent } from "../session.js";
17
+ import type { ModelSelection, AgentPermissionConfig } from "../types.js";
18
+ export type { SseEvent };
/**
 * Settings handed to an LLM provider when it is initialized.
 *
 * Bundles the model choice, the documentation output directory (used to
 * scope file permissions) and optional per-agent permission overrides.
 * Each provider implementation maps these onto its own server or client
 * configuration.
 */
export interface ProviderConfig {
    /** Resolved model selection (provider ID plus model ID). */
    model: ModelSelection;
    /**
     * Absolute path of the directory that receives generated documentation.
     * Providers that restrict file writes scope them to this directory.
     */
    destinationPath: string;
    /**
     * Optional permission overrides, keyed by agent name.
     * When present, the provider applies these scopes to its underlying
     * session engine.
     */
    agentPermissions?: {
        [agentName: string]: AgentPermissionConfig;
    };
    /**
     * Optional sink for verbose logging.
     * Providers should send diagnostic output here whenever it is set.
     */
    verbose?: (msg: string) => void;
    /**
     * Optional absolute path of the `.hem/search-index.db` file.
     * When set, the `hem-search` MCP tool is registered and made available
     * to doc/org/xref agents for on-demand doc search.
     */
    searchDbPath?: string;
}
/**
 * Contract every LLM provider implementation must fulfil.
 *
 * Mirrors Dispatch's `ProviderInstance` pattern: `createSession()`,
 * `prompt()` and `cleanup()` cover the common case, while the low-level
 * `session` and `event` members serve agents that need direct session
 * control (for example OrganizationAgent's broadcast relay).
 *
 * Providers are built and initialized through `static async create()`, so
 * callers always receive a ready-to-use instance and never invoke
 * `initialize()` through this interface.
 */
export interface Provider {
    /** Human-readable provider name, e.g. "copilot" or "opencode". */
    readonly name: string;
    /**
     * Full model identifier such as "anthropic/claude-sonnet-4", when known.
     * Can stay undefined until the first session has been created.
     */
    readonly model?: string;
    /**
     * Open a fresh, isolated session for one task.
     *
     * @returns An opaque session identifier.
     */
    createSession(): Promise<string>;
    /**
     * Send a prompt to an existing session and wait until the agent finishes.
     *
     * @param sessionId - Identifier obtained from `createSession()`.
     * @param text - Prompt text to send.
     * @param options - Optional; `agent` picks the permission profile
     *                  (OpenCode only; the Copilot provider ignores it).
     * @returns The agent's text response, or null when none was produced.
     */
    prompt(sessionId: string, text: string, options?: {
        agent?: string;
    }): Promise<string | null>;
    /**
     * Shut the provider down: stop servers and release resources.
     * Calling it more than once is safe.
     */
    cleanup(): Promise<void>;
    /** Low-level session operations for agents that manage sessions directly. */
    readonly session: {
        /**
         * Fire-and-forget prompt: resolves without waiting for the session to
         * go idle. OrganizationAgent uses it to relay broadcast messages
         * concurrently.
         */
        promptAsync(options: {
            path: {
                id: string;
            };
            body: {
                parts: {
                    type: "text";
                    text: string;
                }[];
                agent?: string;
            };
        }): Promise<{
            data?: void;
            error?: unknown;
        }>;
        /**
         * Best-effort abort of whatever the session is currently doing.
         * Invoked ahead of `delete()` so the session is idle first.
         */
        abort(options: {
            path: {
                id: string;
            };
        }): Promise<{
            data?: boolean;
            error?: unknown;
        }>;
        /** Destroy the session and free the resources it holds. */
        delete(options: {
            path: {
                id: string;
            };
        }): Promise<{
            data?: boolean;
            error?: unknown;
        }>;
    };
    /** Low-level event access for agents that watch the provider's SSE feed. */
    readonly event: {
        /**
         * Subscribe to the provider's SSE events.
         *
         * Resolves to an object whose `stream` is an async generator of
         * events; callers iterate it until a terminal event arrives or they
         * break out of the loop.
         *
         * OrganizationAgent and CrossRefAgent use it to intercept broadcast
         * tool calls and child-session creation events.
         */
        subscribe(options?: Record<string, unknown>): Promise<{
            stream: AsyncGenerator<SseEvent, void, unknown>;
        }>;
    };
    /** The configuration this provider was initialized with. */
    readonly config: ProviderConfig;
}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Provider abstraction layer type definitions for Hem.
3
+ *
4
+ * Defines the {@link Provider} interface that all LLM provider
5
+ * implementations must satisfy, the {@link ProviderConfig} configuration
6
+ * type, and the {@link SseEvent} type used by the SSE relay.
7
+ *
8
+ * Aligns with Dispatch's `ProviderInstance` pattern:
9
+ * - `createSession()` — create an isolated session, returns session ID
10
+ * - `prompt(sessionId, text)` — send a prompt and wait for completion
11
+ * - `cleanup()` — tear down the provider
12
+ *
13
+ * Also exposes low-level `session` and `event` properties for agents that
14
+ * need direct session control (OrganizationAgent, CrossRefAgent, etc.).
15
+ */
16
+ export {};
@@ -0,0 +1,76 @@
1
+ /**
2
+ * Resource-aware concurrency limits.
3
+ *
4
+ * OpenCode runs as a single server process with multiple sessions sharing
5
+ * memory (~500 MB per session). Sessions are CPU-intensive.
6
+ * This module inspects system resources at runtime to compute a safe upper
7
+ * bound on the number of concurrent sessions Hem should run.
8
+ *
9
+ * Rules:
10
+ * - Memory: 1 session per 500 MB of **free** system memory.
11
+ * - CPU: 1 session per CPU core.
12
+ * - Floor: at least {@link CONCURRENCY_FLOOR} regardless of resources.
13
+ * - Effective limit: `max(floor, min(cpuCount, floor(freeMem_MB / 500)))`.
14
+ *
15
+ * The floor exists because when using cloud-hosted LLMs the local memory and
16
+ * CPU are not the bottleneck; overly conservative limits would serialize work
17
+ * that can safely run in parallel. Use `--concurrency 1` to restrict when
18
+ * running a local model on a constrained machine.
19
+ *
20
+ * The user's `--concurrency` flag (if provided) is clamped to this ceiling.
21
+ */
/**
 * Number of files beyond which Hem switches on multi-agent scaling.
 *
 * Projects with more files than this have their exploration and
 * documentation phases spread across several agents per group.
 */
export declare const LARGE_PROJECT_THRESHOLD: 200;
/**
 * Work out how many sessions may safely run at once on this machine.
 *
 * @param userRequested - Optional `--concurrency` value from the user
 *                        (expected to be a positive integer). When given,
 *                        it is clamped to the resource ceiling; when left
 *                        out, the resource ceiling itself is the result.
 * @returns The effective concurrency.
 */
export declare function computeMaxConcurrency(userRequested?: number): number;
/**
 * Build a one-line, human-readable summary of the resource figures behind
 * the concurrency decision (free memory, CPU count, effective limit).
 * Intended for verbose / debug logging.
 *
 * @param userRequested - Optional `--concurrency` value; reported as
 *                        clamped when it exceeds the effective limit.
 */
export declare function describeResourceLimits(userRequested?: number): string;
/**
 * Decide how many agents should work on each group.
 *
 * Projects with at most {@link LARGE_PROJECT_THRESHOLD} files get 1 (the
 * single-agent-per-group default).
 *
 * Above the threshold the count grows linearly from 4 to 8 with the file
 * count and is then capped by the resource ceiling. The cap never pushes
 * the result below 4: a large project always gets at least the minimum
 * agents per group, even when the ceiling is lower.
 *
 * @param fileCount - Total number of source files in the project.
 * @param resourceCeiling - Maximum concurrent sessions the system can
 *                          sustain. Defaults to {@link computeMaxConcurrency}().
 * @returns Agents per group: 1 for small projects, 4-8 for large ones.
 */
export declare function computeAgentsPerGroup(fileCount: number, resourceCeiling?: number): number;
/**
 * Decide how many organization workers should run in parallel.
 *
 * The count grows linearly from 4 workers at 9 documentation files (the
 * minimum for parallel mode) to 7 workers at 50 files, and is then capped
 * by the resource ceiling so the machine is not over-committed. With 8 or
 * fewer files the function returns 1 as a fallback; callers are expected
 * to take the single-agent path in that case.
 *
 * @param fileCount - Number of documentation files to organize.
 * @param resourceCeiling - Maximum concurrent sessions the system can
 *                          sustain. Defaults to {@link computeMaxConcurrency}().
 * @returns Worker count: 4 to 7, capped by the resource ceiling, never below 1.
 */
export declare function computeOrgWorkers(fileCount: number, resourceCeiling?: number): number;
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Resource-aware concurrency limits.
3
+ *
4
+ * OpenCode runs as a single server process with multiple sessions sharing
5
+ * memory (~500 MB per session). Sessions are CPU-intensive.
6
+ * This module inspects system resources at runtime to compute a safe upper
7
+ * bound on the number of concurrent sessions Hem should run.
8
+ *
9
+ * Rules:
10
+ * - Memory: 1 session per 500 MB of **free** system memory.
11
+ * - CPU: 1 session per CPU core.
12
+ * - Floor: at least {@link CONCURRENCY_FLOOR} regardless of resources.
13
+ * - Effective limit: `max(floor, min(cpuCount, floor(freeMem_MB / 500)))`.
14
+ *
15
+ * The floor exists because when using cloud-hosted LLMs the local memory and
16
+ * CPU are not the bottleneck; overly conservative limits would serialize work
17
+ * that can safely run in parallel. Use `--concurrency 1` to restrict when
18
+ * running a local model on a constrained machine.
19
+ *
20
+ * The user's `--concurrency` flag (if provided) is clamped to this ceiling.
21
+ */
22
+ import { freemem, cpus } from "node:os";
/** Bytes of free memory required per concurrent session. */
const BYTES_PER_SESSION = 500 * 1024 * 1024; // 500 MB
/** CPU cores required per concurrent session. */
const CORES_PER_SESSION = 1;
/**
 * Minimum concurrency regardless of available system resources.
 * Prevents over-serialization when using cloud-hosted LLMs where local
 * memory and CPU are not the throughput bottleneck.
 */
const CONCURRENCY_FLOOR = 4;
// ── Large-project scaling ───────────────────────────────────────────────
/**
 * File count above which Hem activates multi-agent scaling.
 *
 * When a project has more files than this threshold, exploration and
 * documentation phases distribute work across multiple agents per group.
 */
export const LARGE_PROJECT_THRESHOLD = 200;
/** Minimum agents per group when multi-agent scaling is active. */
const MIN_AGENTS_PER_GROUP = 4;
/** Maximum agents per group when multi-agent scaling is active. */
const MAX_AGENTS_PER_GROUP = 8;
/**
 * Take ONE snapshot of system resources and derive every limit from it, so
 * all figures reported or used together are mutually consistent.
 */
function takeResourceSnapshot() {
    const freeBytes = freemem();
    const cpuCount = cpus().length;
    // Math.max(1, …) keeps each limit usable even when the OS reports almost
    // no free memory or an empty CPU list.
    const maxByMemory = Math.max(1, Math.floor(freeBytes / BYTES_PER_SESSION));
    const maxByCpu = Math.max(1, Math.floor(cpuCount / CORES_PER_SESSION));
    const resourceLimit = Math.max(CONCURRENCY_FLOOR, Math.min(maxByMemory, maxByCpu));
    return { freeBytes, cpuCount, maxByMemory, maxByCpu, resourceLimit };
}
/**
 * Clamp an optional user request to the resource limit. The result is always
 * a whole number ≥ 1; missing, non-positive or NaN requests fall back to the
 * limit itself.
 */
function clampRequested(userRequested, resourceLimit) {
    if (userRequested != null && userRequested > 0) {
        // Floor so a fractional request (e.g. 0.5 or 2.7) cannot yield a
        // fractional or sub-1 concurrency.
        return Math.max(1, Math.floor(Math.min(userRequested, resourceLimit)));
    }
    return resourceLimit;
}
/**
 * Normalize an optional caller-supplied ceiling to a whole number ≥ 1,
 * falling back to {@link computeMaxConcurrency} when it is missing or
 * not positive.
 */
function resolveCeiling(resourceCeiling) {
    if (resourceCeiling != null && resourceCeiling > 0) {
        return Math.max(1, Math.floor(resourceCeiling));
    }
    return computeMaxConcurrency();
}
/**
 * Compute the maximum safe concurrency based on system resources.
 *
 * @param userRequested - Optional user-provided `--concurrency` value.
 *                        When supplied the result is clamped to the resource
 *                        ceiling; when omitted the resource ceiling itself is
 *                        returned.
 * @returns The effective concurrency (always an integer ≥ 1).
 */
export function computeMaxConcurrency(userRequested) {
    return clampRequested(userRequested, takeResourceSnapshot().resourceLimit);
}
/**
 * Return a human-readable summary of the resource snapshot used for
 * concurrency decisions. Useful for verbose / debug logging.
 *
 * All figures in the summary come from a single resource snapshot.
 *
 * @param userRequested - Optional user-provided `--concurrency` value;
 *                        reported as clamped when it exceeds the effective
 *                        limit.
 * @returns Comma-separated `key=value` summary.
 */
export function describeResourceLimits(userRequested) {
    const { freeBytes, cpuCount, maxByMemory, maxByCpu, resourceLimit } = takeResourceSnapshot();
    const freeGiB = (freeBytes / (1024 * 1024 * 1024)).toFixed(1);
    const effective = clampRequested(userRequested, resourceLimit);
    // Annotate the floor only when it, not the real resources, set the value.
    const resourceFloor = effective === CONCURRENCY_FLOOR && Math.min(maxByMemory, maxByCpu) < CONCURRENCY_FLOOR
        ? ` (floor=${CONCURRENCY_FLOOR})`
        : "";
    const parts = [
        `free_mem=${freeGiB}GB (limit ${maxByMemory})`,
        `cpus=${cpuCount} (limit ${maxByCpu})`,
        `effective=${effective}${resourceFloor}`,
    ];
    if (userRequested != null && userRequested > effective) {
        parts.push(`user_requested=${userRequested} (clamped)`);
    }
    return parts.join(", ");
}
// ── Multi-agent scaling ─────────────────────────────────────────────────
/**
 * Compute the effective number of agents to run per group.
 *
 * For small projects (≤ {@link LARGE_PROJECT_THRESHOLD} files) this returns 1
 * (single-agent-per-group, the current default).
 *
 * For larger projects the count scales linearly from
 * {@link MIN_AGENTS_PER_GROUP} to {@link MAX_AGENTS_PER_GROUP} based on file
 * count, then is capped by the resource ceiling. The cap never lowers the
 * result below {@link MIN_AGENTS_PER_GROUP}.
 *
 * @param fileCount - Total number of source files in the project.
 * @param resourceCeiling - Maximum concurrent sessions the system can sustain.
 *                          Defaults to {@link computeMaxConcurrency}().
 * @returns Agents per group — 1 for small projects, 4-8 for large ones.
 */
export function computeAgentsPerGroup(fileCount, resourceCeiling) {
    // Written as a negated `>` so a NaN file count takes the single-agent
    // path instead of propagating NaN into the result.
    if (!(fileCount > LARGE_PROJECT_THRESHOLD))
        return 1;
    const ceiling = resolveCeiling(resourceCeiling);
    // Linear interpolation: 4 at threshold, 8 at 5× threshold (1000 files).
    const span = LARGE_PROJECT_THRESHOLD * 4; // 800 files of range
    const ratio = Math.min(1, (fileCount - LARGE_PROJECT_THRESHOLD) / span);
    const desired = Math.round(MIN_AGENTS_PER_GROUP + ratio * (MAX_AGENTS_PER_GROUP - MIN_AGENTS_PER_GROUP));
    // Clamp to resource ceiling, but always return at least MIN_AGENTS_PER_GROUP.
    return Math.max(MIN_AGENTS_PER_GROUP, Math.min(desired, ceiling));
}
// ── Organization worker scaling ─────────────────────────────────────────
/** Minimum org workers when parallel mode is active. */
const MIN_ORG_WORKERS = 4;
/** Maximum org workers when parallel mode is active. */
const MAX_ORG_WORKERS = 7;
/**
 * Compute the effective number of parallel organization workers.
 *
 * Scales linearly from {@link MIN_ORG_WORKERS} (4) to {@link MAX_ORG_WORKERS}
 * (7) based on the number of documentation files, then clamps to the resource
 * ceiling so the machine is not over-committed.
 *
 * The scaling range is 9–50 files (9 is the minimum for parallel mode,
 * 50 files reaches the maximum worker count).
 *
 * @param fileCount - Number of documentation files to organize.
 * @param resourceCeiling - Maximum concurrent sessions the system can sustain.
 *                          Defaults to {@link computeMaxConcurrency}().
 * @returns Worker count — 4 to 7, clamped to resource ceiling (but always ≥ 1).
 */
export function computeOrgWorkers(fileCount, resourceCeiling) {
    const scaleStart = 9;
    const scaleEnd = 50;
    // Below parallel threshold, caller should use single-agent path;
    // but if called, return 1 as a safe fallback (also covers NaN). Checked
    // before resolving the ceiling so no system probe is made needlessly.
    if (!(fileCount > scaleStart - 1))
        return 1;
    const ceiling = resolveCeiling(resourceCeiling);
    // Linear interpolation: 4 workers at 9 files, 7 workers at 50 files.
    const span = scaleEnd - scaleStart;
    const ratio = Math.min(1, Math.max(0, (fileCount - scaleStart) / span));
    const desired = Math.round(MIN_ORG_WORKERS + ratio * (MAX_ORG_WORKERS - MIN_ORG_WORKERS));
    return Math.max(1, Math.min(desired, ceiling));
}
@@ -0,0 +1,71 @@
1
+ /**
2
+ * SQLite-backed search index for existing documentation files.
3
+ *
4
+ * Provides:
5
+ * - Full-text search (FTS5 with porter stemmer) over doc content
6
+ * - Incremental updates via SHA-256 hash comparison (re-index only changed docs)
7
+ * - Source-file → doc-file mapping for targeted regeneration
8
+ *
9
+ * Stored in `.hem/search-index.db` alongside the grouping cache.
10
+ *
11
+ * Architecture:
12
+ * - `doc_meta`: tracks path → content_hash for incremental update decisions
13
+ * - `doc_fts`: FTS5 virtual table for ranked keyword search
14
+ * - `source_doc_map`: many-to-many map from source code files to doc files
15
+ *
16
+ * The main process opens the index read-write to build/update it.
17
+ * The optional `hem-search` MCP server opens it read-only for agent queries.
18
+ */
/** One hit returned by {@link SearchIndex.search}. */
export interface SearchResult {
    /** Path of the matching document, as stored in the index. */
    path: string;
    /** Short excerpt of the document content with matching terms highlighted. */
    snippet: string;
}
/**
 * Handle on the SQLite search index. Instances are obtained through
 * {@link SearchIndex.open}; the constructor is private.
 */
export declare class SearchIndex {
    private readonly db;
    private constructor();
    /**
     * Open the SQLite index stored at `dbPath`, creating it (and its parent
     * directory) when it does not exist yet.
     *
     * @param dbPath - Absolute path to the `.db` file.
     * @param readonly - Pass `true` to open read-only (used by the MCP server).
     */
    static open(dbPath: string, readonly?: boolean): SearchIndex;
    /**
     * Insert or refresh a document in the index.
     *
     * The SHA-256 of `content` is compared with the stored hash: when they
     * match nothing is written and `false` is returned; otherwise the
     * document is (re)indexed and `true` is returned.
     */
    upsertDoc(path: string, content: string): boolean;
    /** Drop a document from the index together with its source mappings. */
    removeDoc(path: string): void;
    /**
     * Run a full-text search over the indexed document content.
     *
     * Yields at most `limit` hits ordered by BM25 relevance, each carrying a
     * short snippet with the matching terms highlighted. A blank query, or
     * one that matches nothing, yields an empty array.
     */
    search(query: string, limit?: number): SearchResult[];
    /**
     * Replace the source-file → doc-file mappings recorded for `docPath`.
     * Mappings previously stored for that doc are removed first.
     */
    setSourceMappings(docPath: string, sourcePaths: string[]): void;
    /** List the paths of every doc that covers the given source file. */
    getDocsForSource(sourcePath: string): string[];
    /**
     * Map every indexed path to its stored content hash.
     * Used at startup to work out which docs are stale or deleted.
     */
    getAllHashes(): Map<string, string>;
    /** Close the underlying database connection. */
    close(): void;
}
@@ -0,0 +1,187 @@
1
+ /**
2
+ * SQLite-backed search index for existing documentation files.
3
+ *
4
+ * Provides:
5
+ * - Full-text search (FTS5 with porter stemmer) over doc content
6
+ * - Incremental updates via SHA-256 hash comparison (re-index only changed docs)
7
+ * - Source-file → doc-file mapping for targeted regeneration
8
+ *
9
+ * Stored in `.hem/search-index.db` alongside the grouping cache.
10
+ *
11
+ * Architecture:
12
+ * - `doc_meta`: tracks path → content_hash for incremental update decisions
13
+ * - `doc_fts`: FTS5 virtual table for ranked keyword search
14
+ * - `source_doc_map`: many-to-many map from source code files to doc files
15
+ *
16
+ * The main process opens the index read-write to build/update it.
17
+ * The optional `hem-search` MCP server opens it read-only for agent queries.
18
+ */
19
+ import Database from "better-sqlite3";
20
+ import { createHash } from "node:crypto";
21
+ import { mkdirSync, existsSync } from "node:fs";
22
+ import { dirname } from "node:path";
// ── Schema DDL ────────────────────────────────────────────────────────
/**
 * DDL applied every time the index is opened read-write. All CREATE
 * statements use `IF NOT EXISTS`, so re-running it is harmless.
 */
const SCHEMA_SQL = `
PRAGMA journal_mode = WAL;

CREATE TABLE IF NOT EXISTS doc_meta (
path TEXT PRIMARY KEY,
content_hash TEXT NOT NULL,
indexed_at INTEGER NOT NULL
);

CREATE VIRTUAL TABLE IF NOT EXISTS doc_fts USING fts5(
path UNINDEXED,
content,
tokenize = 'porter ascii'
);

CREATE TABLE IF NOT EXISTS source_doc_map (
source_path TEXT NOT NULL,
doc_path TEXT NOT NULL,
PRIMARY KEY (source_path, doc_path)
);

CREATE INDEX IF NOT EXISTS idx_sdm_doc ON source_doc_map(doc_path);
`;
/**
 * SQLite-backed index over documentation files: full-text search
 * (`doc_fts`), content hashes for incremental updates (`doc_meta`) and a
 * source-file → doc-file map (`source_doc_map`).
 */
export class SearchIndex {
    db;
    constructor(db) {
        this.db = db;
    }
    /**
     * Open (or create) the SQLite index at the given path.
     * The parent directory is created if it does not exist.
     *
     * @param dbPath - Absolute path to the `.db` file.
     * @param readonly - When `true`, open in read-only mode (for MCP server).
     *                   The schema is not applied in that mode.
     * @throws Whatever the database driver throws when the file cannot be
     *         opened or the schema cannot be applied.
     */
    static open(dbPath, readonly = false) {
        if (!readonly) {
            const dir = dirname(dbPath);
            if (!existsSync(dir)) {
                mkdirSync(dir, { recursive: true });
            }
        }
        const db = new Database(dbPath, { readonly });
        if (!readonly) {
            try {
                // Apply schema (idempotent)
                db.exec(SCHEMA_SQL);
            }
            catch (err) {
                // Do not leak the open handle when the schema cannot be applied.
                db.close();
                throw err;
            }
        }
        return new SearchIndex(db);
    }
    /**
     * Upsert a document into the index.
     *
     * Computes SHA-256 of `content`. If the stored hash matches, skips the write
     * and returns `false`. Otherwise (re)indexes and returns `true`.
     *
     * @param path - Document path used as the index key.
     * @param content - Full document text to index.
     */
    upsertDoc(path, content) {
        const hash = sha256(content);
        const existing = this.db
            .prepare("SELECT content_hash FROM doc_meta WHERE path = ?")
            .get(path);
        if (existing?.content_hash === hash) {
            return false;
        }
        const deleteFts = this.db.prepare("DELETE FROM doc_fts WHERE path = ?");
        const insertFts = this.db.prepare("INSERT INTO doc_fts(path, content) VALUES (?, ?)");
        const upsertMeta = this.db.prepare("INSERT INTO doc_meta(path, content_hash, indexed_at) VALUES (?, ?, ?) " +
            "ON CONFLICT(path) DO UPDATE SET content_hash = excluded.content_hash, indexed_at = excluded.indexed_at");
        // FTS rows and the stored hash must change together.
        this.db.transaction(() => {
            deleteFts.run(path);
            insertFts.run(path, content);
            upsertMeta.run(path, hash, Date.now());
        })();
        return true;
    }
    /**
     * Remove a document from the index and its source mappings.
     *
     * The three deletes run in one transaction. `upsertDoc` skips any doc
     * whose stored hash still matches, so a partial removal that dropped the
     * FTS rows but kept the `doc_meta` hash would leave the doc permanently
     * unsearchable.
     *
     * @param path - Document path to remove.
     */
    removeDoc(path) {
        const deleteFts = this.db.prepare("DELETE FROM doc_fts WHERE path = ?");
        const deleteMeta = this.db.prepare("DELETE FROM doc_meta WHERE path = ?");
        const deleteMappings = this.db.prepare("DELETE FROM source_doc_map WHERE doc_path = ?");
        this.db.transaction(() => {
            deleteFts.run(path);
            deleteMeta.run(path);
            deleteMappings.run(path);
        })();
    }
    /**
     * Full-text search over indexed doc content.
     *
     * Returns up to `limit` results ranked by BM25 relevance, each with a
     * short snippet highlighting matching terms.
     *
     * Returns an empty array if the query is blank or produces no results.
     * Any error raised while running the query is also reported as an empty
     * result (best-effort search).
     *
     * @param query - Free-text query; only ASCII letters and digits are used.
     * @param limit - Maximum number of results (default 5).
     */
    search(query, limit = 5) {
        const trimmed = query.trim();
        if (!trimmed)
            return [];
        // Build a safe FTS5 query: extract alphanumeric words and join with spaces.
        // FTS5 treats space-separated words as implicit AND, so each word must
        // appear in the document. Quoting the whole query as a phrase would require
        // exact adjacency, which is too strict for our use case.
        // Lower-casing also keeps words from being read as the upper-case FTS5
        // operators; single-character words are dropped.
        const words = trimmed
            .toLowerCase()
            .split(/[^a-z0-9]+/)
            .filter((w) => w.length > 1);
        if (words.length === 0)
            return [];
        const safeQuery = words.join(" ");
        try {
            const rows = this.db
                .prepare(`SELECT path,
snippet(doc_fts, 1, '<b>', '</b>', '…', 24) AS snippet
FROM doc_fts
WHERE doc_fts MATCH ?
ORDER BY rank
LIMIT ?`)
                .all(safeQuery, limit);
            return rows.map((r) => ({ path: r.path, snippet: r.snippet }));
        }
        catch {
            // FTS5 syntax error (e.g., query is just punctuation) — return nothing
            return [];
        }
    }
    /**
     * Replace the source-file → doc-file mappings for a given doc.
     *
     * Any previously recorded source paths for `docPath` are removed first;
     * removal and re-insertion happen in one transaction.
     *
     * @param docPath - Document whose mappings are replaced.
     * @param sourcePaths - Source files the document covers.
     */
    setSourceMappings(docPath, sourcePaths) {
        const deleteExisting = this.db.prepare("DELETE FROM source_doc_map WHERE doc_path = ?");
        const insert = this.db.prepare("INSERT OR IGNORE INTO source_doc_map(source_path, doc_path) VALUES (?, ?)");
        this.db.transaction((paths) => {
            deleteExisting.run(docPath);
            for (const sp of paths) {
                insert.run(sp, docPath);
            }
        })(sourcePaths);
    }
    /**
     * Returns the paths of all docs that cover a given source file.
     *
     * @param sourcePath - Source file to look up.
     */
    getDocsForSource(sourcePath) {
        const rows = this.db
            .prepare("SELECT doc_path FROM source_doc_map WHERE source_path = ?")
            .all(sourcePath);
        return rows.map((r) => r.doc_path);
    }
    /**
     * Returns a map of all indexed paths to their stored content hashes.
     * Used during startup to determine which docs are stale or deleted.
     */
    getAllHashes() {
        const rows = this.db
            .prepare("SELECT path, content_hash FROM doc_meta")
            .all();
        return new Map(rows.map((r) => [r.path, r.content_hash]));
    }
    /** Close the underlying database connection. */
    close() {
        this.db.close();
    }
}
// ── Helpers ───────────────────────────────────────────────────────────
/**
 * Hash a string with SHA-256.
 *
 * @param input - Text to hash; encoded as UTF-8 before hashing.
 * @returns The digest as a lowercase hex string.
 */
function sha256(input) {
    const hasher = createHash("sha256");
    hasher.update(input, "utf-8");
    return hasher.digest("hex");
}
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Standalone stdio-based MCP server exposing the Hem doc search index.
4
+ *
5
+ * Exposes two tools:
6
+ * - `search_docs({ query, limit? })` — FTS5 keyword search over indexed docs
7
+ * - `get_docs_for_source({ source_path })` — find docs covering a source file
8
+ *
9
+ * Receives the SQLite DB path as the first CLI argument:
10
+ * node dist/search-mcp.js /path/to/.hem/search-index.db
11
+ *
12
+ * Opens the DB in read-only mode. Multiple agent processes may connect
13
+ * simultaneously; SQLite WAL mode handles concurrent reads safely.
14
+ *
15
+ * Registered in OpenCode via config.mcp as:
16
+ * { type: "local", command: ["node", "dist/search-mcp.js", dbPath], enabled: true }
17
+ */
18
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
19
+ import { SearchIndex } from "./search-index.js";
/**
 * Create the hem-search MCP server on top of the supplied SearchIndex.
 *
 * Exported so tests can build a server directly; the CLI bootstrap further
 * down connects it to a stdio transport backed by a real, read-only
 * SQLite index.
 *
 * @param index - The search index the server's tools query.
 * @returns The configured MCP server.
 */
export declare function createSearchMcpServer(index: SearchIndex): McpServer;