pi-doc-injector 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Pi Doc Injector
2
2
 
3
- A [Pi](https://pi.dev) extension that automatically injects relevant project documentation into the LLM system prompt by monitoring streaming output for keyword matches.
3
+ A [Pi](https://pi.dev) extension that automatically injects relevant project documentation into the LLM context by monitoring streaming output for keyword matches. Docs are delivered as a `CustomMessage` so the system prompt stays untouched and the provider's prompt cache stays warm.
4
4
 
5
5
  ## Installation
6
6
 
@@ -29,8 +29,9 @@ git clone https://github.com/yourname/pi-doc-injector.git .pi/extensions/doc-inj
29
29
  1. Create a `docs/` folder in your project root.
30
30
  2. Add markdown files with frontmatter (`title` + `keywords`). See [Document Format](#document-format) for supported formats.
31
31
  3. Start Pi. The extension scans `docs/` on session start.
32
- 4. When the user mentions a keyword, the matching doc is injected into the
33
- system prompt **before the assistant responds** — no one-turn delay.
32
+ 4. When the user mentions a keyword, the matching doc is injected as a
33
+ `CustomMessage` into the conversation **before the assistant responds** —
34
+ no one-turn delay. The system prompt is never modified.
34
35
  5. If the assistant mentions a NEW keyword mid-response, generation is
35
36
  automatically aborted and restarted with the doc injected immediately.
36
37
 
@@ -208,18 +209,41 @@ The extension uses a per-session injection model:
208
209
  - **Assistant streaming**: if the assistant mentions a NEW keyword mid-response,
209
210
  generation is aborted and restarted with the doc injected immediately.
210
211
 
211
- ### System Prompt Lifecycle
212
+ ### Injection Mechanism
212
213
 
213
- Pi **reconstructs the system prompt from source files each turn** (verified against pi v0.70.6).
214
+ On match, the extension returns a `message` field from `before_agent_start`
215
+ with `customType: "doc-injector"`. Pi appends this to the session and sends
216
+ it to the LLM as part of the conversation. The system prompt is **never**
217
+ mutated.
214
218
 
215
- When `before_agent_start` fires, the `systemPrompt` passed to the extension is a freshly rebuilt prompt from `AGENTS.md`, `SYSTEM.md`, skills, and tool snippets. It is **not** accumulated from previous turns.
219
+ #### Why a CustomMessage, not the system prompt?
216
220
 
217
- This means:
221
+ - The system prompt is the highest-value prompt-cache slot. Each unique
222
+ system prompt text breaks the cache (5-min TTL by default). Appending
223
+ per-turn doc content there would invalidate the cache on every first
224
+ injection.
225
+ - A `CustomMessage` only adds to the conversation prefix, leaving the
226
+ system prompt byte-identical across turns and the cache warm.
218
227
 
219
- - Injections apply to the **current turn only** and do not persist in subsequent turns.
220
- - There is no risk of duplicate injection sections stacking up over time.
221
- - The `injected` flag alone is sufficient to prevent re-injection — no additional deduplication or marker-based stripping is needed.
228
+ #### Double-injection prevention
222
229
 
230
+ Two independent guards make duplicate injection impossible in a session:
231
+
232
+ 1. **Matcher guard** — `buildMatcher()` only includes non-injected entries
233
+ (via `getNonInjectedEntries()`), so already-injected docs cannot be
234
+ re-matched.
235
+ 2. **Mark guard** — `markInjected()` runs inside `before_agent_start` before
236
+ the LLM call, so even if the matcher ever produced a duplicate, the
237
+ mark would still prevent a second send.
238
+
239
+ In practice, the matcher guard is the primary defense; the mark guard is
240
+ defense-in-depth for race conditions (e.g. if `resources_discover` rebuilds
241
+ the registry mid-injection).
242
+
243
+ The `injected` flag is per-session: it's reset on `session_start` and can
244
+ be manually cleared with `/doc-inject reset`.
245
+
246
+
223
247
  For the full source-level verification, see the JSDoc block in `index.ts`.
224
248
 
225
249
  ## Development
package/cache.ts CHANGED
@@ -9,6 +9,7 @@
9
9
  import { mkdir, readFile, writeFile } from "node:fs/promises";
10
10
  import { dirname, join } from "node:path";
11
11
  import type { KeywordCache } from "./types";
12
+ import type { Notifier } from "./notifier";
12
13
 
13
14
  const CACHE_FILENAME = ".pi/doc-injector-cache.json";
14
15
  const CACHE_VERSION = 1;
@@ -17,8 +18,11 @@ const CACHE_VERSION = 1;
17
18
  * Load the keyword cache from disk.
18
19
  * Returns an empty cache (version 1, no files) if the file doesn't exist,
19
20
  * has wrong version, or is corrupted.
21
+ *
22
+ * Recoverable issues (corrupt JSON, wrong version) emit a warning via the
23
+ * `notifier`. ENOENT (no cache file yet) is silent.
20
24
  */
21
- export async function loadCache(cwd: string): Promise<KeywordCache> {
25
+ export async function loadCache(cwd: string, notifier: Notifier): Promise<KeywordCache> {
22
26
  const cachePath = join(cwd, CACHE_FILENAME);
23
27
 
24
28
  try {
@@ -26,7 +30,7 @@ export async function loadCache(cwd: string): Promise<KeywordCache> {
26
30
  const parsed: unknown = JSON.parse(raw);
27
31
 
28
32
  if (!isValidCache(parsed)) {
29
- console.warn(
33
+ notifier.warn(
30
34
  `[doc-injector] Invalid cache format or version at ${cachePath}, resetting.`,
31
35
  );
32
36
  return emptyCache();
@@ -36,10 +40,8 @@ export async function loadCache(cwd: string): Promise<KeywordCache> {
36
40
  } catch (err) {
37
41
  // ENOENT = no cache file yet, that's fine
38
42
  if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
39
- console.warn(
40
- `[doc-injector] Failed to read cache at ${cachePath}:`,
41
- err instanceof Error ? err.message : String(err),
42
- );
43
+ const detail = err instanceof Error ? err.message : String(err);
44
+ notifier.warn(`[doc-injector] Failed to read cache at ${cachePath}: ${detail}`);
43
45
  }
44
46
  return emptyCache();
45
47
  }
package/config.ts CHANGED
@@ -5,47 +5,56 @@
5
5
  import { readFile } from "node:fs/promises";
6
6
  import { join } from "node:path";
7
7
  import { DEFAULT_CONFIG, type DocInjectorConfig } from "./types";
8
+ import type { Notifier } from "./notifier";
8
9
 
9
10
  /**
10
11
  * Clamp an integer value to [min, max] range.
11
- * Warns and clamps if out of range. Returns the default if not a number.
12
+ * Warns via the `notifier` and clamps if out of range. Returns the default
13
+ * if not a number.
12
14
  */
13
15
  function clampInt(
14
- value: unknown,
15
- defaultVal: number,
16
- min: number,
17
- max: number,
18
- fieldName: string,
16
+ value: unknown,
17
+ defaultVal: number,
18
+ min: number,
19
+ max: number,
20
+ fieldName: string,
21
+ notifier: Notifier,
19
22
  ): number {
20
- if (typeof value !== "number" || Number.isNaN(value)) {
21
- return defaultVal;
22
- }
23
- const intVal = Math.trunc(value);
24
- if (intVal < min || intVal > max) {
25
- const clamped = Math.max(min, Math.min(max, intVal));
26
- console.warn(`[doc-injector] ${fieldName} must be ${min}-${max}, got ${intVal}. Clamping to ${clamped}.`);
27
- return clamped;
28
- }
29
- return intVal;
23
+ if (typeof value !== "number" || Number.isNaN(value)) {
24
+ return defaultVal;
25
+ }
26
+ const intVal = Math.trunc(value);
27
+ if (intVal < min || intVal > max) {
28
+ const clamped = Math.max(min, Math.min(max, intVal));
29
+ notifier.warn(`[doc-injector] ${fieldName} must be ${min}-${max}, got ${intVal}. Clamping to ${clamped}.`);
30
+ return clamped;
31
+ }
32
+ return intVal;
30
33
  }
31
34
 
35
+
32
36
  /**
33
37
  * Validate a glob pattern array.
34
38
  * Rejects non-array or entries that aren't strings. Returns default on error.
39
+ * Warns via the `notifier` for non-string entries.
35
40
  */
36
- function validateGlobArray(value: unknown, defaultVal: string[]): string[] {
37
- if (!Array.isArray(value)) {
38
- return [...defaultVal];
39
- }
40
- const result: string[] = [];
41
- for (const item of value) {
42
- if (typeof item === "string") {
43
- result.push(item);
44
- } else {
45
- console.warn(`[doc-injector] Non-string entry in glob array ignored: ${String(item)}`);
41
+ function validateGlobArray(
42
+ value: unknown,
43
+ defaultVal: string[],
44
+ notifier: Notifier,
45
+ ): string[] {
46
+ if (!Array.isArray(value)) {
47
+ return [...defaultVal];
48
+ }
49
+ const result: string[] = [];
50
+ for (const item of value) {
51
+ if (typeof item === "string") {
52
+ result.push(item);
53
+ } else {
54
+ notifier.warn(`[doc-injector] Non-string entry in glob array ignored: ${String(item)}`);
55
+ }
46
56
  }
47
- }
48
- return result.length > 0 ? result : [...defaultVal];
57
+ return result.length > 0 ? result : [...defaultVal];
49
58
  }
50
59
 
51
60
  /**
@@ -54,33 +63,37 @@ function validateGlobArray(value: unknown, defaultVal: string[]): string[] {
54
63
  * Validates and clamps all numeric fields. Falls back to DEFAULT_CONFIG
55
64
  * if file doesn't exist or is invalid.
56
65
  */
57
- export async function loadConfig(cwd: string): Promise<DocInjectorConfig> {
58
- const configPath = join(cwd, ".pi", "doc-injector.json");
66
+ /**
67
+ * Load config from `.pi/doc-injector.json` relative to the given cwd.
68
+ * Async — uses readFile from fs/promises. Validates and clamps all numeric
69
+ * fields. Falls back to DEFAULT_CONFIG if the file doesn't exist or is
70
+ * invalid. Warnings (clamping, invalid entries) go through the `notifier`.
71
+ */
72
+ export async function loadConfig(cwd: string, notifier: Notifier): Promise<DocInjectorConfig> {
73
+ const configPath = join(cwd, ".pi", "doc-injector.json");
59
74
 
60
- try {
61
- const raw = await readFile(configPath, "utf-8");
62
- const parsed = JSON.parse(raw) as Partial<DocInjectorConfig>;
75
+ try {
76
+ const raw = await readFile(configPath, "utf-8");
77
+ const parsed = JSON.parse(raw) as Partial<DocInjectorConfig>;
63
78
 
64
- return {
65
- docsPath: parsed.docsPath ?? DEFAULT_CONFIG.docsPath,
66
- matchThreshold: clampInt(parsed.matchThreshold, DEFAULT_CONFIG.matchThreshold, 1, Infinity, "matchThreshold"),
67
- contextThreshold: clampInt(parsed.contextThreshold, DEFAULT_CONFIG.contextThreshold, 0, 100, "contextThreshold"),
68
- recursive: parsed.recursive ?? DEFAULT_CONFIG.recursive,
69
- include: validateGlobArray(parsed.include, DEFAULT_CONFIG.include),
70
- exclude: validateGlobArray(parsed.exclude, DEFAULT_CONFIG.exclude),
71
- maxFileSize: clampInt(parsed.maxFileSize, DEFAULT_CONFIG.maxFileSize, 1024, 10 * 1024 * 1024, "maxFileSize"),
72
- autoKeywords: parsed.autoKeywords ?? DEFAULT_CONFIG.autoKeywords,
73
- llmKeywords: parsed.llmKeywords ?? DEFAULT_CONFIG.llmKeywords,
74
- maxConcurrent: clampInt(parsed.maxConcurrent, DEFAULT_CONFIG.maxConcurrent, 1, 100, "maxConcurrent"),
75
- llmBatchSize: clampInt(parsed.llmBatchSize, DEFAULT_CONFIG.llmBatchSize, 1, 100, "llmBatchSize"),
76
- };
77
- } catch (err) {
78
- if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
79
- console.warn(
80
- `[doc-injector] Failed to parse config at ${configPath}:`,
81
- err instanceof Error ? err.message : String(err),
82
- );
79
+ return {
80
+ docsPath: parsed.docsPath ?? DEFAULT_CONFIG.docsPath,
81
+ matchThreshold: clampInt(parsed.matchThreshold, DEFAULT_CONFIG.matchThreshold, 1, Infinity, "matchThreshold", notifier),
82
+ contextThreshold: clampInt(parsed.contextThreshold, DEFAULT_CONFIG.contextThreshold, 0, 100, "contextThreshold", notifier),
83
+ recursive: parsed.recursive ?? DEFAULT_CONFIG.recursive,
84
+ include: validateGlobArray(parsed.include, DEFAULT_CONFIG.include, notifier),
85
+ exclude: validateGlobArray(parsed.exclude, DEFAULT_CONFIG.exclude, notifier),
86
+ maxFileSize: clampInt(parsed.maxFileSize, DEFAULT_CONFIG.maxFileSize, 1024, 10 * 1024 * 1024, "maxFileSize", notifier),
87
+ autoKeywords: parsed.autoKeywords ?? DEFAULT_CONFIG.autoKeywords,
88
+ llmKeywords: parsed.llmKeywords ?? DEFAULT_CONFIG.llmKeywords,
89
+ maxConcurrent: clampInt(parsed.maxConcurrent, DEFAULT_CONFIG.maxConcurrent, 1, 100, "maxConcurrent", notifier),
90
+ llmBatchSize: clampInt(parsed.llmBatchSize, DEFAULT_CONFIG.llmBatchSize, 1, 100, "llmBatchSize", notifier),
91
+ };
92
+ } catch (err) {
93
+ if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
94
+ const detail = err instanceof Error ? err.message : String(err);
95
+ notifier.warn(`[doc-injector] Failed to parse config at ${configPath}: ${detail}`);
96
+ }
97
+ return { ...DEFAULT_CONFIG };
83
98
  }
84
- return { ...DEFAULT_CONFIG };
85
- }
86
99
  }
package/index.ts CHANGED
@@ -71,13 +71,19 @@ import { loadConfig } from "./config";
71
71
  import { buildInjectionContent, notifyInjection } from "./injector";
72
72
  import { buildKeywordGenPrompt } from "./keyword-llm";
73
73
  import { extractText, KeywordMatcher } from "./matcher";
74
+ import { ExtensionNotifier, type Notifier } from "./notifier";
74
75
  import { DocRegistry } from "./registry";
75
- import { DEFAULT_MATCHER_OPTIONS, type DocEntry, type MatchResult, type KeywordCache, type CacheEntry } from "./types";
76
+ import { DEFAULT_MATCHER_OPTIONS, LLM_CACHE_SENTINEL, type DocEntry, type MatchResult, type KeywordCache, type CacheEntry } from "./types";
76
77
  import { registerCommands } from "./commands";
77
78
 
78
79
  export default async function docInjectorExtension(pi: ExtensionAPI) {
79
80
  // ---- State ----
80
- let config = await loadConfig(process.cwd());
81
+ // The notifier buffers warnings emitted during startup (loadConfig,
82
+ // loadCache, initRegistry) and flushes them via ctx.ui.notify() in
83
+ // session_start. The notifier is bound to the extension lifecycle so
84
+ // startup messages aren't lost.
85
+ const notifier: Notifier = new ExtensionNotifier();
86
+ let config = await loadConfig(process.cwd(), notifier);
81
87
  let registry: DocRegistry | null = null;
82
88
  let initRegistryPromise: Promise<void> | null = null;
83
89
  let enabled = true;
@@ -102,7 +108,7 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
102
108
  const safeSaveCache = async (cwd: string, dirtyEntries: Record<string, CacheEntry>) => {
103
109
  // MAJOR-2 fix: before saveCache, re-read cache from disk to merge
104
110
  // LLM-written entries that may have landed during the scan.
105
- const freshCache = await loadCache(cwd);
111
+ const freshCache = await loadCache(cwd, notifier);
106
112
  const mergedCache: KeywordCache = { version: 1, files: {} };
107
113
 
108
114
  // Start with fresh (disk) entries — includes any LLM writes during scan
@@ -119,10 +125,10 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
119
125
  };
120
126
 
121
127
  const initRegistry = async (cwd: string) => {
122
- config = await loadConfig(cwd);
128
+ config = await loadConfig(cwd, notifier);
123
129
  const docsPath = resolve(cwd, config.docsPath);
124
- cache = await loadCache(cwd);
125
- registry = await DocRegistry.create(docsPath, config, cache);
130
+ cache = await loadCache(cwd, notifier);
131
+ registry = await DocRegistry.create(docsPath, config, cache, notifier);
126
132
 
127
133
  const dirty = registry.getDirtyCache();
128
134
  if (Object.keys(dirty).length > 0) {
@@ -178,7 +184,9 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
178
184
  continue;
179
185
  }
180
186
  cache.files[item.path] = {
181
- mtimeMs: fileStat.mtimeMs,
187
+ // Use the sentinel — never the real mtime — so the next rebuild
188
+ // surfaces this entry as keywordSource: "llm" instead of "cache".
189
+ mtimeMs: LLM_CACHE_SENTINEL,
182
190
  keywords: item.keywords.map((k) => k.toLowerCase()).slice(0, 20),
183
191
  };
184
192
  saved++;
@@ -203,6 +211,12 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
203
211
  llmBatchesCompleted = 0;
204
212
  llmTotalFiles = 0;
205
213
 
214
+ // Bind the notifier to the live context FIRST so any warnings emitted
215
+ // during initRegistry below go directly to the TUI instead of being
216
+ // buffered. Messages buffered from earlier (e.g. the factory-body
217
+ // loadConfig call) are flushed here in arrival order.
218
+ notifier.setContext(ctx);
219
+
206
220
  if (event.reason === "reload") return;
207
221
 
208
222
  if (initRegistryPromise) {
@@ -223,7 +237,7 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
223
237
  const effectiveCwd = cwd ?? process.cwd();
224
238
 
225
239
  // Reload cache from disk to pick up LLM-generated entries
226
- const freshCache = await loadCache(effectiveCwd);
240
+ const freshCache = await loadCache(effectiveCwd, notifier);
227
241
  cache = freshCache;
228
242
  registry.updateCache(cache);
229
243
 
package/notifier.ts ADDED
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Notifier — thin wrapper around Pi's `ctx.ui.notify()` that buffers
3
+ * messages until a context is available.
4
+ *
5
+ * ## Why a buffer?
6
+ *
7
+ * Several warnings fire at startup (during `loadConfig` and `initRegistry`),
8
+ * before any `ExtensionContext` exists — extensions are constructed first,
9
+ * events fire later. The `Notifier` interface accepts messages at any time:
10
+ *
11
+ * - If a context has been set, messages are forwarded to `ctx.ui.notify()`.
12
+ * - If not, messages are buffered in memory and flushed on the next
13
+ * `setContext()` call (typically from `session_start`).
14
+ *
15
+ * Production code uses `ExtensionNotifier`. Tests inject a plain object
16
+ * satisfying the `Notifier` interface (or a `vi.fn()` spy) — no real
17
+ * extension context is needed.
18
+ */
19
+ import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
20
+
21
+ export type NotifierLevel = "info" | "warning" | "error";
22
+
23
+ export interface Notifier {
24
+ /** Show an informational message. */
25
+ info(message: string): void;
26
+ /** Show a warning. */
27
+ warn(message: string): void;
28
+ /** Show an error. */
29
+ error(message: string): void;
30
+ /**
31
+ * Bind a context. Flushes any buffered messages via `ctx.ui.notify()`
32
+ * in arrival order. Idempotent: re-calling replaces the context and
33
+ * clears the buffer (already-flushed messages are not re-sent).
34
+ */
35
+ setContext(ctx: ExtensionContext): void;
36
+ }
37
+
38
+ /** Production notifier. Buffers until a context is bound. */
39
+ export class ExtensionNotifier implements Notifier {
40
+ private ctx: ExtensionContext | null = null;
41
+ private buffer: Array<{ level: NotifierLevel; message: string }> = [];
42
+
43
+ setContext(ctx: ExtensionContext): void {
44
+ this.ctx = ctx;
45
+ const pending = this.buffer;
46
+ this.buffer = [];
47
+ for (const { level, message } of pending) {
48
+ ctx.ui.notify(message, level);
49
+ }
50
+ }
51
+
52
+ info(message: string): void {
53
+ this.emit("info", message);
54
+ }
55
+
56
+ warn(message: string): void {
57
+ this.emit("warning", message);
58
+ }
59
+
60
+ error(message: string): void {
61
+ this.emit("error", message);
62
+ }
63
+
64
+ private emit(level: NotifierLevel, message: string): void {
65
+ if (this.ctx) {
66
+ this.ctx.ui.notify(message, level);
67
+ } else {
68
+ this.buffer.push({ level, message });
69
+ }
70
+ }
71
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-doc-injector",
3
- "version": "0.5.0",
3
+ "version": "0.5.2",
4
4
  "description": "Auto-inject relevant project documentation into Pi's LLM context based on keyword matching",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
package/registry.ts CHANGED
@@ -8,7 +8,8 @@
8
8
  import type { Dirent } from "node:fs";
9
9
  import { readdir, readFile, stat } from "node:fs/promises";
10
10
  import { basename, extname, join, relative, resolve } from "node:path";
11
- import type { CacheEntry, DocEntry, DocInjectorConfig, KeywordCache } from "./types";
11
+ import { LLM_CACHE_SENTINEL, type CacheEntry, type DocEntry, type DocInjectorConfig, type KeywordCache } from "./types";
12
+ import type { Notifier } from "./notifier";
12
13
  import { createGlobFilter } from "./globber";
13
14
  import { generateKeywords } from "./keyword-gen";
14
15
 
@@ -225,28 +226,36 @@ class PromisePool {
225
226
  * Document Registry class. Scans a docs folder and maintains an index of DocEntry.
226
227
  */
227
228
  export class DocRegistry {
228
- private entries: DocEntry[] = [];
229
- private docsPath: string;
230
- private config: DocInjectorConfig;
231
- private cache: KeywordCache | null = null;
232
- private dirtyCache: KeywordCache = { version: 1, files: {} };
233
-
234
- private constructor(docsPath: string, config: DocInjectorConfig, cache?: KeywordCache) {
235
- this.docsPath = docsPath;
236
- this.config = config;
237
- this.cache = cache ?? null;
238
- }
229
+ private entries: DocEntry[] = [];
230
+ private docsPath: string;
231
+ private config: DocInjectorConfig;
232
+ private cache: KeywordCache | null = null;
233
+ private dirtyCache: KeywordCache = { version: 1, files: {} };
234
+ private notifier: Notifier;
235
+
236
+ private constructor(
237
+ docsPath: string,
238
+ config: DocInjectorConfig,
239
+ cache: KeywordCache | undefined,
240
+ notifier: Notifier,
241
+ ) {
242
+ this.docsPath = docsPath;
243
+ this.config = config;
244
+ this.cache = cache ?? null;
245
+ this.notifier = notifier;
246
+ }
239
247
 
240
- /** Create a registry by scanning the docs folder. */
241
- static async create(
242
- docsPath: string,
243
- config: DocInjectorConfig,
244
- cache?: KeywordCache,
245
- ): Promise<DocRegistry> {
246
- const registry = new DocRegistry(docsPath, config, cache);
247
- await registry.rebuild();
248
- return registry;
249
- }
248
+ /** Create a registry by scanning the docs folder. */
249
+ static async create(
250
+ docsPath: string,
251
+ config: DocInjectorConfig,
252
+ cache: KeywordCache | undefined,
253
+ notifier: Notifier,
254
+ ): Promise<DocRegistry> {
255
+ const registry = new DocRegistry(docsPath, config, cache, notifier);
256
+ await registry.rebuild();
257
+ return registry;
258
+ }
250
259
 
251
260
  /** Re-scan the docs folder and rebuild the index. */
252
261
  async rebuild(): Promise<void> {
@@ -274,106 +283,124 @@ export class DocRegistry {
274
283
  const results = await pool.all(tasks);
275
284
  this.entries = results.filter((e): e is DocEntry => e !== null);
276
285
  } catch {
277
- console.warn(`[doc-injector] Docs folder not found: ${resolved}`);
286
+ this.notifier.warn(`[doc-injector] Docs folder not found: ${resolved}`);
278
287
  this.entries = [];
279
288
  }
280
289
  }
281
290
 
282
291
  /**
283
- * Process a single file through the full pipeline.
292
+ * Process a single file through the priority chain.
284
293
  * Returns a DocEntry or null if the file should be skipped.
294
+ *
295
+ * Priority (highest to lowest):
296
+ * 1. Frontmatter (authoritative — explicitly written by the doc author)
297
+ * 2. Cache (perf layer — mtime match means content hasn't changed)
298
+ * 3. Heuristic (free, automatic, local — filename + headings + code symbols)
299
+ * 4. Skip (no frontmatter, no cache, autoKeywords disabled)
300
+ *
301
+ * LLM-generated keywords populate the cache via the `_doc_injector_keywords`
302
+ * tool with `mtimeMs` set to `LLM_CACHE_SENTINEL`, so they surface as
303
+ * `keywordSource: "llm"` (not "cache") on the next rebuild.
285
304
  */
286
305
  private async processFile(
287
306
  { filePath, relativePath, fileName }: ScanResult,
288
307
  preserved: Map<string, boolean>,
289
308
  ): Promise<DocEntry | null> {
290
309
  try {
291
- // ═══ METADATA + CACHE ═══
292
-
293
- // Step 1: Stat the file for size and mtime
310
+ // ─── METADATA ─────────────────────────────────────────────
294
311
  const fileStat = await stat(filePath);
295
312
 
296
- // Step 2: Skip files exceeding maxFileSize
297
313
  if (fileStat.size > this.config.maxFileSize) {
298
- console.warn(
314
+ this.notifier.warn(
299
315
  `[doc-injector] Skipping ${relativePath}: size ${fileStat.size} > max ${this.config.maxFileSize}`,
300
316
  );
301
317
  return null;
302
318
  }
303
319
 
304
- const cachedEntry = this.cache?.files[relativePath];
305
-
306
- // Step 6: Cache hit — mtime matches, use cached keywords
307
- if (cachedEntry && cachedEntry.mtimeMs === fileStat.mtimeMs) {
308
- // Still read the file for content and title (needed for injection),
309
- // but skip keyword generation entirely
310
- const raw = await readFile(filePath, "utf-8");
311
- const title = extractTitle(raw, fileName);
320
+ // Read once — needed for frontmatter parse, content, and title.
321
+ const raw = await readFile(filePath, "utf-8");
312
322
 
323
+ // ─── PRIORITY 1: Frontmatter (authoritative) ─────────────
324
+ const parsed = parseFrontmatter(raw);
325
+ if (parsed) {
326
+ // Frontmatter is self-caching (lives in the file), no dirty mark needed.
313
327
  return {
314
328
  filePath,
315
329
  fileName,
316
330
  relativePath,
317
- title,
318
- keywords: cachedEntry.keywords,
331
+ title: parsed.title,
332
+ keywords: parsed.keywords,
319
333
  content: raw,
320
334
  injected: preserved.get(filePath) ?? false,
321
- keywordSource: "cache",
335
+ keywordSource: "frontmatter",
322
336
  };
323
337
  }
324
338
 
325
- // ═══ FULL READ + PARSE (cache miss) ═══
326
-
327
- // Step 7: Read file content
328
- const raw = await readFile(filePath, "utf-8");
339
+ // ─── PRIORITY 2: Cache (mtime match means content unchanged) ──
340
+ const cachedEntry = this.cache?.files[relativePath];
341
+ if (cachedEntry) {
342
+ // LLM-generated: sentinel mtime never matches a real file
343
+ if (cachedEntry.mtimeMs === LLM_CACHE_SENTINEL) {
344
+ const title = extractTitle(raw, fileName);
345
+ return {
346
+ filePath,
347
+ fileName,
348
+ relativePath,
349
+ title,
350
+ keywords: cachedEntry.keywords,
351
+ content: raw,
352
+ injected: preserved.get(filePath) ?? false,
353
+ keywordSource: "llm",
354
+ };
355
+ }
356
+ // Real mtime match: heuristic or prior LLM-upgrade cache hit
357
+ if (cachedEntry.mtimeMs === fileStat.mtimeMs) {
358
+ const title = extractTitle(raw, fileName);
359
+ return {
360
+ filePath,
361
+ fileName,
362
+ relativePath,
363
+ title,
364
+ keywords: cachedEntry.keywords,
365
+ content: raw,
366
+ injected: preserved.get(filePath) ?? false,
367
+ keywordSource: "cache",
368
+ };
369
+ }
370
+ }
329
371
 
330
- // Step 8: Try frontmatter parsing
331
- const parsed = parseFrontmatter(raw);
372
+ // ─── PRIORITY 3: Heuristic (free, automatic fallback) ─────────
373
+ if (this.config.autoKeywords) {
374
+ const title = extractTitle(raw, fileName);
375
+ const keywords = generateKeywords(fileName, raw);
332
376
 
333
- let title: string;
334
- let keywords: string[];
335
- let keywordSource: DocEntry["keywordSource"];
377
+ // Mark cache dirty (newly generated keywords must be persisted).
378
+ this.dirtyCache.files[relativePath] = {
379
+ mtimeMs: fileStat.mtimeMs,
380
+ keywords,
381
+ };
336
382
 
337
- if (parsed) {
338
- // Step 9: Frontmatter found — use its title and keywords
339
- title = parsed.title;
340
- keywords = parsed.keywords;
341
- keywordSource = "frontmatter";
342
- } else if (this.config.autoKeywords) {
343
- // Step 10: No frontmatter, generate keywords heuristically
344
- title = extractTitle(raw, fileName);
345
- keywords = generateKeywords(fileName, raw);
346
- keywordSource = "heuristic";
347
- } else {
348
- // Step 11: No frontmatter and autoKeywords disabled — skip
349
- console.warn(
350
- `[doc-injector] Skipping ${relativePath}: no valid frontmatter with keywords`,
351
- );
352
- return null;
383
+ return {
384
+ filePath,
385
+ fileName,
386
+ relativePath,
387
+ title,
388
+ keywords,
389
+ content: raw,
390
+ injected: preserved.get(filePath) ?? false,
391
+ keywordSource: "heuristic",
392
+ };
353
393
  }
354
394
 
355
- // ═══ CACHE UPDATE ═══
356
-
357
- // Step 12: Mark as dirty (mtime changed or keywords generated)
358
- this.dirtyCache.files[relativePath] = {
359
- mtimeMs: fileStat.mtimeMs,
360
- keywords,
361
- };
362
-
363
- return {
364
- filePath,
365
- fileName,
366
- relativePath,
367
- title,
368
- keywords,
369
- content: raw,
370
- injected: preserved.get(filePath) ?? false,
371
- keywordSource,
372
- };
395
+ // ─── PRIORITY 4: Skip ───────────────────────────────────────────
396
+ this.notifier.warn(
397
+ `[doc-injector] Skipping ${relativePath}: no valid frontmatter with keywords`,
398
+ );
399
+ return null;
373
400
  } catch (err) {
374
401
  // Only warn for unexpected errors, not ENOENT (file deleted/moved after scan)
375
402
  if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
376
- console.warn(`[doc-injector] Error reading ${relativePath}:`, err);
403
+ this.notifier.warn(`[doc-injector] Error reading ${relativePath}: ${err instanceof Error ? err.message : String(err)}`);
377
404
  }
378
405
  return null;
379
406
  }
package/types.ts CHANGED
@@ -107,4 +107,16 @@ export const DEFAULT_CONFIG: DocInjectorConfig = {
107
107
  export const DEFAULT_MATCHER_OPTIONS: MatcherOptions = {
108
108
  matchThreshold: DEFAULT_CONFIG.matchThreshold,
109
109
  caseSensitive: false,
110
- };
110
+ };
111
+
112
+ /**
113
+ * Sentinel value used in CacheEntry.mtimeMs to mark entries written by the
114
+ * LLM keyword generator. -1 is chosen because `fs.Stats.mtimeMs` is a
115
+ * millisecond timestamp relative to the POSIX Epoch, so any file modified
116
+ * after 1970 has a positive value and will not collide with -1.
117
+ * Heuristic-written entries use the real file mtime.
118
+ *
119
+ * If you find yourself writing LLM_CACHE_SENTINEL into a real cache entry
120
+ * from a non-LLM code path, that's a bug.
121
+ */
122
+ export const LLM_CACHE_SENTINEL = -1;