pi-doc-injector 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -10
- package/cache.ts +8 -6
- package/config.ts +67 -54
- package/index.ts +22 -8
- package/notifier.ts +71 -0
- package/package.json +1 -1
- package/registry.ts +111 -84
- package/types.ts +13 -1
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Pi Doc Injector
|
|
2
2
|
|
|
3
|
-
A [Pi](https://pi.dev) extension that automatically injects relevant project documentation into the LLM
|
|
3
|
+
A [Pi](https://pi.dev) extension that automatically injects relevant project documentation into the LLM context by monitoring streaming output for keyword matches. Docs are delivered as a `CustomMessage` so the system prompt stays untouched and the provider's prompt cache stays warm.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -29,8 +29,9 @@ git clone https://github.com/yourname/pi-doc-injector.git .pi/extensions/doc-inj
|
|
|
29
29
|
1. Create a `docs/` folder in your project root.
|
|
30
30
|
2. Add markdown files with frontmatter (`title` + `keywords`). See [Document Format](#document-format) for supported formats.
|
|
31
31
|
3. Start Pi. The extension scans `docs/` on session start.
|
|
32
|
-
4. When the user mentions a keyword, the matching doc is injected
|
|
33
|
-
|
|
32
|
+
4. When the user mentions a keyword, the matching doc is injected as a
|
|
33
|
+
`CustomMessage` into the conversation **before the assistant responds** —
|
|
34
|
+
no one-turn delay. The system prompt is never modified.
|
|
34
35
|
5. If the assistant mentions a NEW keyword mid-response, generation is
|
|
35
36
|
automatically aborted and restarted with the doc injected immediately.
|
|
36
37
|
|
|
@@ -208,18 +209,41 @@ The extension uses a per-session injection model:
|
|
|
208
209
|
- **Assistant streaming**: if the assistant mentions a NEW keyword mid-response,
|
|
209
210
|
generation is aborted and restarted with the doc injected immediately.
|
|
210
211
|
|
|
211
|
-
###
|
|
212
|
+
### Injection Mechanism
|
|
212
213
|
|
|
213
|
-
|
|
214
|
+
On match, the extension returns a `message` field from `before_agent_start`
|
|
215
|
+
with `customType: "doc-injector"`. Pi appends this to the session and sends
|
|
216
|
+
it to the LLM as part of the conversation. The system prompt is **never**
|
|
217
|
+
mutated.
|
|
214
218
|
|
|
215
|
-
|
|
219
|
+
#### Why a CustomMessage, not the system prompt?
|
|
216
220
|
|
|
217
|
-
|
|
221
|
+
- The system prompt is the highest-value prompt-cache slot. Each unique
|
|
222
|
+
system prompt text breaks the cache (5-min TTL by default). Appending
|
|
223
|
+
per-turn doc content there would invalidate the cache on every first
|
|
224
|
+
injection.
|
|
225
|
+
- A `CustomMessage` only adds to the conversation prefix, leaving the
|
|
226
|
+
system prompt byte-identical across turns and the cache warm.
|
|
218
227
|
|
|
219
|
-
-
|
|
220
|
-
- There is no risk of duplicate injection sections stacking up over time.
|
|
221
|
-
- The `injected` flag alone is sufficient to prevent re-injection — no additional deduplication or marker-based stripping is needed.
|
|
228
|
+
#### Double-injection prevention
|
|
222
229
|
|
|
230
|
+
Two independent guards make duplicate injection impossible in a session:
|
|
231
|
+
|
|
232
|
+
1. **Matcher guard** — `buildMatcher()` only includes non-injected entries
|
|
233
|
+
(via `getNonInjectedEntries()`), so already-injected docs cannot be
|
|
234
|
+
re-matched.
|
|
235
|
+
2. **Mark guard** — `markInjected()` runs inside `before_agent_start` before
|
|
236
|
+
the LLM call, so even if the matcher ever produced a duplicate, the
|
|
237
|
+
mark would still prevent a second send.
|
|
238
|
+
|
|
239
|
+
In practice, the matcher guard is the primary defense; the mark guard is
|
|
240
|
+
defense-in-depth for race conditions (e.g. if `resources_discover` rebuilds
|
|
241
|
+
the registry mid-injection).
|
|
242
|
+
|
|
243
|
+
The `injected` flag is per-session: it's reset on `session_start` and can
|
|
244
|
+
be manually cleared with `/doc-inject reset`.
|
|
245
|
+
|
|
246
|
+
For the full source-level verification, see the JSDoc block in `index.ts`.
|
|
223
247
|
For the full source-level verification, see the JSDoc block in `index.ts`.
|
|
224
248
|
|
|
225
249
|
## Development
|
package/cache.ts
CHANGED
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
10
10
|
import { dirname, join } from "node:path";
|
|
11
11
|
import type { KeywordCache } from "./types";
|
|
12
|
+
import type { Notifier } from "./notifier";
|
|
12
13
|
|
|
13
14
|
const CACHE_FILENAME = ".pi/doc-injector-cache.json";
|
|
14
15
|
const CACHE_VERSION = 1;
|
|
@@ -17,8 +18,11 @@ const CACHE_VERSION = 1;
|
|
|
17
18
|
* Load the keyword cache from disk.
|
|
18
19
|
* Returns an empty cache (version 1, no files) if the file doesn't exist,
|
|
19
20
|
* has wrong version, or is corrupted.
|
|
21
|
+
*
|
|
22
|
+
* Recoverable issues (corrupt JSON, wrong version) emit a warning via the
|
|
23
|
+
* `notifier`. ENOENT (no cache file yet) is silent.
|
|
20
24
|
*/
|
|
21
|
-
export async function loadCache(cwd: string): Promise<KeywordCache> {
|
|
25
|
+
export async function loadCache(cwd: string, notifier: Notifier): Promise<KeywordCache> {
|
|
22
26
|
const cachePath = join(cwd, CACHE_FILENAME);
|
|
23
27
|
|
|
24
28
|
try {
|
|
@@ -26,7 +30,7 @@ export async function loadCache(cwd: string): Promise<KeywordCache> {
|
|
|
26
30
|
const parsed: unknown = JSON.parse(raw);
|
|
27
31
|
|
|
28
32
|
if (!isValidCache(parsed)) {
|
|
29
|
-
|
|
33
|
+
notifier.warn(
|
|
30
34
|
`[doc-injector] Invalid cache format or version at ${cachePath}, resetting.`,
|
|
31
35
|
);
|
|
32
36
|
return emptyCache();
|
|
@@ -36,10 +40,8 @@ export async function loadCache(cwd: string): Promise<KeywordCache> {
|
|
|
36
40
|
} catch (err) {
|
|
37
41
|
// ENOENT = no cache file yet, that's fine
|
|
38
42
|
if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
err instanceof Error ? err.message : String(err),
|
|
42
|
-
);
|
|
43
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
44
|
+
notifier.warn(`[doc-injector] Failed to read cache at ${cachePath}: ${detail}`);
|
|
43
45
|
}
|
|
44
46
|
return emptyCache();
|
|
45
47
|
}
|
package/config.ts
CHANGED
|
@@ -5,47 +5,56 @@
|
|
|
5
5
|
import { readFile } from "node:fs/promises";
|
|
6
6
|
import { join } from "node:path";
|
|
7
7
|
import { DEFAULT_CONFIG, type DocInjectorConfig } from "./types";
|
|
8
|
+
import type { Notifier } from "./notifier";
|
|
8
9
|
|
|
9
10
|
/**
|
|
10
11
|
* Clamp an integer value to [min, max] range.
|
|
11
|
-
* Warns and clamps if out of range. Returns the default
|
|
12
|
+
* Warns via the `notifier` and clamps if out of range. Returns the default
|
|
13
|
+
* if not a number.
|
|
12
14
|
*/
|
|
13
15
|
function clampInt(
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
value: unknown,
|
|
17
|
+
defaultVal: number,
|
|
18
|
+
min: number,
|
|
19
|
+
max: number,
|
|
20
|
+
fieldName: string,
|
|
21
|
+
notifier: Notifier,
|
|
19
22
|
): number {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
23
|
+
if (typeof value !== "number" || Number.isNaN(value)) {
|
|
24
|
+
return defaultVal;
|
|
25
|
+
}
|
|
26
|
+
const intVal = Math.trunc(value);
|
|
27
|
+
if (intVal < min || intVal > max) {
|
|
28
|
+
const clamped = Math.max(min, Math.min(max, intVal));
|
|
29
|
+
notifier.warn(`[doc-injector] ${fieldName} must be ${min}-${max}, got ${intVal}. Clamping to ${clamped}.`);
|
|
30
|
+
return clamped;
|
|
31
|
+
}
|
|
32
|
+
return intVal;
|
|
30
33
|
}
|
|
31
34
|
|
|
35
|
+
/**
|
|
32
36
|
/**
|
|
33
37
|
* Validate a glob pattern array.
|
|
34
38
|
* Rejects non-array or entries that aren't strings. Returns default on error.
|
|
39
|
+
* Warns via the `notifier` for non-string entries.
|
|
35
40
|
*/
|
|
36
|
-
function validateGlobArray(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
41
|
+
function validateGlobArray(
|
|
42
|
+
value: unknown,
|
|
43
|
+
defaultVal: string[],
|
|
44
|
+
notifier: Notifier,
|
|
45
|
+
): string[] {
|
|
46
|
+
if (!Array.isArray(value)) {
|
|
47
|
+
return [...defaultVal];
|
|
48
|
+
}
|
|
49
|
+
const result: string[] = [];
|
|
50
|
+
for (const item of value) {
|
|
51
|
+
if (typeof item === "string") {
|
|
52
|
+
result.push(item);
|
|
53
|
+
} else {
|
|
54
|
+
notifier.warn(`[doc-injector] Non-string entry in glob array ignored: ${String(item)}`);
|
|
55
|
+
}
|
|
46
56
|
}
|
|
47
|
-
|
|
48
|
-
return result.length > 0 ? result : [...defaultVal];
|
|
57
|
+
return result.length > 0 ? result : [...defaultVal];
|
|
49
58
|
}
|
|
50
59
|
|
|
51
60
|
/**
|
|
@@ -54,33 +63,37 @@ function validateGlobArray(value: unknown, defaultVal: string[]): string[] {
|
|
|
54
63
|
* Validates and clamps all numeric fields. Falls back to DEFAULT_CONFIG
|
|
55
64
|
* if file doesn't exist or is invalid.
|
|
56
65
|
*/
|
|
57
|
-
|
|
58
|
-
|
|
66
|
+
/**
|
|
67
|
+
* Load config from `.pi/doc-injector.json` relative to the given cwd.
|
|
68
|
+
* Async — uses readFile from fs/promises. Validates and clamps all numeric
|
|
69
|
+
* fields. Falls back to DEFAULT_CONFIG if the file doesn't exist or is
|
|
70
|
+
* invalid. Warnings (clamping, invalid entries) go through the `notifier`.
|
|
71
|
+
*/
|
|
72
|
+
export async function loadConfig(cwd: string, notifier: Notifier): Promise<DocInjectorConfig> {
|
|
73
|
+
const configPath = join(cwd, ".pi", "doc-injector.json");
|
|
59
74
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
75
|
+
try {
|
|
76
|
+
const raw = await readFile(configPath, "utf-8");
|
|
77
|
+
const parsed = JSON.parse(raw) as Partial<DocInjectorConfig>;
|
|
63
78
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
79
|
+
return {
|
|
80
|
+
docsPath: parsed.docsPath ?? DEFAULT_CONFIG.docsPath,
|
|
81
|
+
matchThreshold: clampInt(parsed.matchThreshold, DEFAULT_CONFIG.matchThreshold, 1, Infinity, "matchThreshold", notifier),
|
|
82
|
+
contextThreshold: clampInt(parsed.contextThreshold, DEFAULT_CONFIG.contextThreshold, 0, 100, "contextThreshold", notifier),
|
|
83
|
+
recursive: parsed.recursive ?? DEFAULT_CONFIG.recursive,
|
|
84
|
+
include: validateGlobArray(parsed.include, DEFAULT_CONFIG.include, notifier),
|
|
85
|
+
exclude: validateGlobArray(parsed.exclude, DEFAULT_CONFIG.exclude, notifier),
|
|
86
|
+
maxFileSize: clampInt(parsed.maxFileSize, DEFAULT_CONFIG.maxFileSize, 1024, 10 * 1024 * 1024, "maxFileSize", notifier),
|
|
87
|
+
autoKeywords: parsed.autoKeywords ?? DEFAULT_CONFIG.autoKeywords,
|
|
88
|
+
llmKeywords: parsed.llmKeywords ?? DEFAULT_CONFIG.llmKeywords,
|
|
89
|
+
maxConcurrent: clampInt(parsed.maxConcurrent, DEFAULT_CONFIG.maxConcurrent, 1, 100, "maxConcurrent", notifier),
|
|
90
|
+
llmBatchSize: clampInt(parsed.llmBatchSize, DEFAULT_CONFIG.llmBatchSize, 1, 100, "llmBatchSize", notifier),
|
|
91
|
+
};
|
|
92
|
+
} catch (err) {
|
|
93
|
+
if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
|
|
94
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
95
|
+
notifier.warn(`[doc-injector] Failed to parse config at ${configPath}: ${detail}`);
|
|
96
|
+
}
|
|
97
|
+
return { ...DEFAULT_CONFIG };
|
|
83
98
|
}
|
|
84
|
-
return { ...DEFAULT_CONFIG };
|
|
85
|
-
}
|
|
86
99
|
}
|
package/index.ts
CHANGED
|
@@ -71,13 +71,19 @@ import { loadConfig } from "./config";
|
|
|
71
71
|
import { buildInjectionContent, notifyInjection } from "./injector";
|
|
72
72
|
import { buildKeywordGenPrompt } from "./keyword-llm";
|
|
73
73
|
import { extractText, KeywordMatcher } from "./matcher";
|
|
74
|
+
import { ExtensionNotifier, type Notifier } from "./notifier";
|
|
74
75
|
import { DocRegistry } from "./registry";
|
|
75
|
-
import { DEFAULT_MATCHER_OPTIONS, type DocEntry, type MatchResult, type KeywordCache, type CacheEntry } from "./types";
|
|
76
|
+
import { DEFAULT_MATCHER_OPTIONS, LLM_CACHE_SENTINEL, type DocEntry, type MatchResult, type KeywordCache, type CacheEntry } from "./types";
|
|
76
77
|
import { registerCommands } from "./commands";
|
|
77
78
|
|
|
78
79
|
export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
79
80
|
// ---- State ----
|
|
80
|
-
|
|
81
|
+
// The notifier buffers warnings emitted during startup (loadConfig,
|
|
82
|
+
// loadCache, initRegistry) and flushes them via ctx.ui.notify() in
|
|
83
|
+
// session_start. The notifier is bound to the extension lifecycle so
|
|
84
|
+
// startup messages aren't lost.
|
|
85
|
+
const notifier: Notifier = new ExtensionNotifier();
|
|
86
|
+
let config = await loadConfig(process.cwd(), notifier);
|
|
81
87
|
let registry: DocRegistry | null = null;
|
|
82
88
|
let initRegistryPromise: Promise<void> | null = null;
|
|
83
89
|
let enabled = true;
|
|
@@ -102,7 +108,7 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
102
108
|
const safeSaveCache = async (cwd: string, dirtyEntries: Record<string, CacheEntry>) => {
|
|
103
109
|
// MAJOR-2 fix: before saveCache, re-read cache from disk to merge
|
|
104
110
|
// LLM-written entries that may have landed during the scan.
|
|
105
|
-
const freshCache = await loadCache(cwd);
|
|
111
|
+
const freshCache = await loadCache(cwd, notifier);
|
|
106
112
|
const mergedCache: KeywordCache = { version: 1, files: {} };
|
|
107
113
|
|
|
108
114
|
// Start with fresh (disk) entries — includes any LLM writes during scan
|
|
@@ -119,10 +125,10 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
119
125
|
};
|
|
120
126
|
|
|
121
127
|
const initRegistry = async (cwd: string) => {
|
|
122
|
-
config = await loadConfig(cwd);
|
|
128
|
+
config = await loadConfig(cwd, notifier);
|
|
123
129
|
const docsPath = resolve(cwd, config.docsPath);
|
|
124
|
-
cache = await loadCache(cwd);
|
|
125
|
-
registry = await DocRegistry.create(docsPath, config, cache);
|
|
130
|
+
cache = await loadCache(cwd, notifier);
|
|
131
|
+
registry = await DocRegistry.create(docsPath, config, cache, notifier);
|
|
126
132
|
|
|
127
133
|
const dirty = registry.getDirtyCache();
|
|
128
134
|
if (Object.keys(dirty).length > 0) {
|
|
@@ -178,7 +184,9 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
178
184
|
continue;
|
|
179
185
|
}
|
|
180
186
|
cache.files[item.path] = {
|
|
181
|
-
|
|
187
|
+
// Use the sentinel — never the real mtime — so the next rebuild
|
|
188
|
+
// surfaces this entry as keywordSource: "llm" instead of "cache".
|
|
189
|
+
mtimeMs: LLM_CACHE_SENTINEL,
|
|
182
190
|
keywords: item.keywords.map((k) => k.toLowerCase()).slice(0, 20),
|
|
183
191
|
};
|
|
184
192
|
saved++;
|
|
@@ -203,6 +211,12 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
203
211
|
llmBatchesCompleted = 0;
|
|
204
212
|
llmTotalFiles = 0;
|
|
205
213
|
|
|
214
|
+
// Bind the notifier to the live context FIRST so any warnings emitted
|
|
215
|
+
// during initRegistry below go directly to the TUI instead of being
|
|
216
|
+
// buffered. Messages buffered from earlier (e.g. the factory-body
|
|
217
|
+
// loadConfig call) are flushed here in arrival order.
|
|
218
|
+
notifier.setContext(ctx);
|
|
219
|
+
|
|
206
220
|
if (event.reason === "reload") return;
|
|
207
221
|
|
|
208
222
|
if (initRegistryPromise) {
|
|
@@ -223,7 +237,7 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
223
237
|
const effectiveCwd = cwd ?? process.cwd();
|
|
224
238
|
|
|
225
239
|
// Reload cache from disk to pick up LLM-generated entries
|
|
226
|
-
const freshCache = await loadCache(effectiveCwd);
|
|
240
|
+
const freshCache = await loadCache(effectiveCwd, notifier);
|
|
227
241
|
cache = freshCache;
|
|
228
242
|
registry.updateCache(cache);
|
|
229
243
|
|
package/notifier.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Notifier — thin wrapper around Pi's `ctx.ui.notify()` that buffers
|
|
3
|
+
* messages until a context is available.
|
|
4
|
+
*
|
|
5
|
+
* ## Why a buffer?
|
|
6
|
+
*
|
|
7
|
+
* Several warnings fire at startup (during `loadConfig` and `initRegistry`),
|
|
8
|
+
* before any `ExtensionContext` exists — extensions are constructed first,
|
|
9
|
+
* events fire later. The `Notifier` interface accepts messages at any time:
|
|
10
|
+
*
|
|
11
|
+
* - If a context has been set, messages are forwarded to `ctx.ui.notify()`.
|
|
12
|
+
* - If not, messages are buffered in memory and flushed on the next
|
|
13
|
+
* `setContext()` call (typically from `session_start`).
|
|
14
|
+
*
|
|
15
|
+
* Production code uses `ExtensionNotifier`. Tests inject a plain object
|
|
16
|
+
* satisfying the `Notifier` interface (or a `vi.fn()` spy) — no real
|
|
17
|
+
* extension context is needed.
|
|
18
|
+
*/
|
|
19
|
+
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
20
|
+
|
|
21
|
+
export type NotifierLevel = "info" | "warning" | "error";
|
|
22
|
+
|
|
23
|
+
export interface Notifier {
|
|
24
|
+
/** Show an informational message. */
|
|
25
|
+
info(message: string): void;
|
|
26
|
+
/** Show a warning. */
|
|
27
|
+
warn(message: string): void;
|
|
28
|
+
/** Show an error. */
|
|
29
|
+
error(message: string): void;
|
|
30
|
+
/**
|
|
31
|
+
* Bind a context. Flushes any buffered messages via `ctx.ui.notify()`
|
|
32
|
+
* in arrival order. Idempotent: re-calling replaces the context and
|
|
33
|
+
* clears the buffer (already-flushed messages are not re-sent).
|
|
34
|
+
*/
|
|
35
|
+
setContext(ctx: ExtensionContext): void;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/** Production notifier. Buffers until a context is bound. */
|
|
39
|
+
export class ExtensionNotifier implements Notifier {
|
|
40
|
+
private ctx: ExtensionContext | null = null;
|
|
41
|
+
private buffer: Array<{ level: NotifierLevel; message: string }> = [];
|
|
42
|
+
|
|
43
|
+
setContext(ctx: ExtensionContext): void {
|
|
44
|
+
this.ctx = ctx;
|
|
45
|
+
const pending = this.buffer;
|
|
46
|
+
this.buffer = [];
|
|
47
|
+
for (const { level, message } of pending) {
|
|
48
|
+
ctx.ui.notify(message, level);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
info(message: string): void {
|
|
53
|
+
this.emit("info", message);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
warn(message: string): void {
|
|
57
|
+
this.emit("warning", message);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
error(message: string): void {
|
|
61
|
+
this.emit("error", message);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
private emit(level: NotifierLevel, message: string): void {
|
|
65
|
+
if (this.ctx) {
|
|
66
|
+
this.ctx.ui.notify(message, level);
|
|
67
|
+
} else {
|
|
68
|
+
this.buffer.push({ level, message });
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
package/package.json
CHANGED
package/registry.ts
CHANGED
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
import type { Dirent } from "node:fs";
|
|
9
9
|
import { readdir, readFile, stat } from "node:fs/promises";
|
|
10
10
|
import { basename, extname, join, relative, resolve } from "node:path";
|
|
11
|
-
import type
|
|
11
|
+
import { LLM_CACHE_SENTINEL, type CacheEntry, type DocEntry, type DocInjectorConfig, type KeywordCache } from "./types";
|
|
12
|
+
import type { Notifier } from "./notifier";
|
|
12
13
|
import { createGlobFilter } from "./globber";
|
|
13
14
|
import { generateKeywords } from "./keyword-gen";
|
|
14
15
|
|
|
@@ -225,28 +226,36 @@ class PromisePool {
|
|
|
225
226
|
* Document Registry class. Scans a docs folder and maintains an index of DocEntry.
|
|
226
227
|
*/
|
|
227
228
|
export class DocRegistry {
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
229
|
+
private entries: DocEntry[] = [];
|
|
230
|
+
private docsPath: string;
|
|
231
|
+
private config: DocInjectorConfig;
|
|
232
|
+
private cache: KeywordCache | null = null;
|
|
233
|
+
private dirtyCache: KeywordCache = { version: 1, files: {} };
|
|
234
|
+
private notifier: Notifier;
|
|
235
|
+
|
|
236
|
+
private constructor(
|
|
237
|
+
docsPath: string,
|
|
238
|
+
config: DocInjectorConfig,
|
|
239
|
+
cache: KeywordCache | undefined,
|
|
240
|
+
notifier: Notifier,
|
|
241
|
+
) {
|
|
242
|
+
this.docsPath = docsPath;
|
|
243
|
+
this.config = config;
|
|
244
|
+
this.cache = cache ?? null;
|
|
245
|
+
this.notifier = notifier;
|
|
246
|
+
}
|
|
239
247
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
248
|
+
/** Create a registry by scanning the docs folder. */
|
|
249
|
+
static async create(
|
|
250
|
+
docsPath: string,
|
|
251
|
+
config: DocInjectorConfig,
|
|
252
|
+
cache: KeywordCache | undefined,
|
|
253
|
+
notifier: Notifier,
|
|
254
|
+
): Promise<DocRegistry> {
|
|
255
|
+
const registry = new DocRegistry(docsPath, config, cache, notifier);
|
|
256
|
+
await registry.rebuild();
|
|
257
|
+
return registry;
|
|
258
|
+
}
|
|
250
259
|
|
|
251
260
|
/** Re-scan the docs folder and rebuild the index. */
|
|
252
261
|
async rebuild(): Promise<void> {
|
|
@@ -274,106 +283,124 @@ export class DocRegistry {
|
|
|
274
283
|
const results = await pool.all(tasks);
|
|
275
284
|
this.entries = results.filter((e): e is DocEntry => e !== null);
|
|
276
285
|
} catch {
|
|
277
|
-
|
|
286
|
+
this.notifier.warn(`[doc-injector] Docs folder not found: ${resolved}`);
|
|
278
287
|
this.entries = [];
|
|
279
288
|
}
|
|
280
289
|
}
|
|
281
290
|
|
|
282
291
|
/**
|
|
283
|
-
* Process a single file through the
|
|
292
|
+
* Process a single file through the priority chain.
|
|
284
293
|
* Returns a DocEntry or null if the file should be skipped.
|
|
294
|
+
*
|
|
295
|
+
* Priority (highest to lowest):
|
|
296
|
+
* 1. Frontmatter (authoritative — explicitly written by the doc author)
|
|
297
|
+
* 2. Cache (perf layer — mtime match means content hasn't changed)
|
|
298
|
+
* 3. Heuristic (free, automatic, local — filename + headings + code symbols)
|
|
299
|
+
* 4. Skip (no frontmatter, no cache, autoKeywords disabled)
|
|
300
|
+
*
|
|
301
|
+
* LLM-generated keywords populate the cache via the `_doc_injector_keywords`
|
|
302
|
+
* tool, so they surface as `keywordSource: "llm"` on the next rebuild
|
|
303
|
+
* (their `mtimeMs` is written as `LLM_CACHE_SENTINEL`, not the file's real mtime).
|
|
285
304
|
*/
|
|
286
305
|
private async processFile(
|
|
287
306
|
{ filePath, relativePath, fileName }: ScanResult,
|
|
288
307
|
preserved: Map<string, boolean>,
|
|
289
308
|
): Promise<DocEntry | null> {
|
|
290
309
|
try {
|
|
291
|
-
//
|
|
292
|
-
|
|
293
|
-
// Step 1: Stat the file for size and mtime
|
|
310
|
+
// ─── METADATA ─────────────────────────────────────────────
|
|
294
311
|
const fileStat = await stat(filePath);
|
|
295
312
|
|
|
296
|
-
// Step 2: Skip files exceeding maxFileSize
|
|
297
313
|
if (fileStat.size > this.config.maxFileSize) {
|
|
298
|
-
|
|
314
|
+
this.notifier.warn(
|
|
299
315
|
`[doc-injector] Skipping ${relativePath}: size ${fileStat.size} > max ${this.config.maxFileSize}`,
|
|
300
316
|
);
|
|
301
317
|
return null;
|
|
302
318
|
}
|
|
303
319
|
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
// Step 6: Cache hit — mtime matches, use cached keywords
|
|
307
|
-
if (cachedEntry && cachedEntry.mtimeMs === fileStat.mtimeMs) {
|
|
308
|
-
// Still read the file for content and title (needed for injection),
|
|
309
|
-
// but skip keyword generation entirely
|
|
310
|
-
const raw = await readFile(filePath, "utf-8");
|
|
311
|
-
const title = extractTitle(raw, fileName);
|
|
320
|
+
// Read once — needed for frontmatter parse, content, and title.
|
|
321
|
+
const raw = await readFile(filePath, "utf-8");
|
|
312
322
|
|
|
323
|
+
// ─── PRIORITY 1: Frontmatter (authoritative) ─────────────
|
|
324
|
+
const parsed = parseFrontmatter(raw);
|
|
325
|
+
if (parsed) {
|
|
326
|
+
// Frontmatter is self-caching (lives in the file), no dirty mark needed.
|
|
313
327
|
return {
|
|
314
328
|
filePath,
|
|
315
329
|
fileName,
|
|
316
330
|
relativePath,
|
|
317
|
-
title,
|
|
318
|
-
keywords:
|
|
331
|
+
title: parsed.title,
|
|
332
|
+
keywords: parsed.keywords,
|
|
319
333
|
content: raw,
|
|
320
334
|
injected: preserved.get(filePath) ?? false,
|
|
321
|
-
keywordSource: "
|
|
335
|
+
keywordSource: "frontmatter",
|
|
322
336
|
};
|
|
323
337
|
}
|
|
324
338
|
|
|
325
|
-
//
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
339
|
+
// ─── PRIORITY 2: Cache (mtime match means content unchanged) ──
|
|
340
|
+
const cachedEntry = this.cache?.files[relativePath];
|
|
341
|
+
if (cachedEntry) {
|
|
342
|
+
// LLM-generated: sentinel mtime never matches a real file
|
|
343
|
+
if (cachedEntry.mtimeMs === LLM_CACHE_SENTINEL) {
|
|
344
|
+
const title = extractTitle(raw, fileName);
|
|
345
|
+
return {
|
|
346
|
+
filePath,
|
|
347
|
+
fileName,
|
|
348
|
+
relativePath,
|
|
349
|
+
title,
|
|
350
|
+
keywords: cachedEntry.keywords,
|
|
351
|
+
content: raw,
|
|
352
|
+
injected: preserved.get(filePath) ?? false,
|
|
353
|
+
keywordSource: "llm",
|
|
354
|
+
};
|
|
355
|
+
}
|
|
356
|
+
// Real mtime match: heuristic or prior LLM-upgrade cache hit
|
|
357
|
+
if (cachedEntry.mtimeMs === fileStat.mtimeMs) {
|
|
358
|
+
const title = extractTitle(raw, fileName);
|
|
359
|
+
return {
|
|
360
|
+
filePath,
|
|
361
|
+
fileName,
|
|
362
|
+
relativePath,
|
|
363
|
+
title,
|
|
364
|
+
keywords: cachedEntry.keywords,
|
|
365
|
+
content: raw,
|
|
366
|
+
injected: preserved.get(filePath) ?? false,
|
|
367
|
+
keywordSource: "cache",
|
|
368
|
+
};
|
|
369
|
+
}
|
|
370
|
+
}
|
|
329
371
|
|
|
330
|
-
//
|
|
331
|
-
|
|
372
|
+
// ─── PRIORITY 3: Heuristic (free, automatic fallback) ─────────
|
|
373
|
+
if (this.config.autoKeywords) {
|
|
374
|
+
const title = extractTitle(raw, fileName);
|
|
375
|
+
const keywords = generateKeywords(fileName, raw);
|
|
332
376
|
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
377
|
+
// Mark cache dirty (newly generated keywords must be persisted).
|
|
378
|
+
this.dirtyCache.files[relativePath] = {
|
|
379
|
+
mtimeMs: fileStat.mtimeMs,
|
|
380
|
+
keywords,
|
|
381
|
+
};
|
|
336
382
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
} else {
|
|
348
|
-
// Step 11: No frontmatter and autoKeywords disabled — skip
|
|
349
|
-
console.warn(
|
|
350
|
-
`[doc-injector] Skipping ${relativePath}: no valid frontmatter with keywords`,
|
|
351
|
-
);
|
|
352
|
-
return null;
|
|
383
|
+
return {
|
|
384
|
+
filePath,
|
|
385
|
+
fileName,
|
|
386
|
+
relativePath,
|
|
387
|
+
title,
|
|
388
|
+
keywords,
|
|
389
|
+
content: raw,
|
|
390
|
+
injected: preserved.get(filePath) ?? false,
|
|
391
|
+
keywordSource: "heuristic",
|
|
392
|
+
};
|
|
353
393
|
}
|
|
354
394
|
|
|
355
|
-
//
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
keywords,
|
|
361
|
-
};
|
|
362
|
-
|
|
363
|
-
return {
|
|
364
|
-
filePath,
|
|
365
|
-
fileName,
|
|
366
|
-
relativePath,
|
|
367
|
-
title,
|
|
368
|
-
keywords,
|
|
369
|
-
content: raw,
|
|
370
|
-
injected: preserved.get(filePath) ?? false,
|
|
371
|
-
keywordSource,
|
|
372
|
-
};
|
|
395
|
+
// ─── PRIORITY 4: Skip ───────────────────────────────────────────
|
|
396
|
+
this.notifier.warn(
|
|
397
|
+
`[doc-injector] Skipping ${relativePath}: no valid frontmatter with keywords`,
|
|
398
|
+
);
|
|
399
|
+
return null;
|
|
373
400
|
} catch (err) {
|
|
374
401
|
// Only warn for unexpected errors, not ENOENT (file deleted/moved after scan)
|
|
375
402
|
if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
|
|
376
|
-
|
|
403
|
+
this.notifier.warn(`[doc-injector] Error reading ${relativePath}: ${err instanceof Error ? err.message : String(err)}`);
|
|
377
404
|
}
|
|
378
405
|
return null;
|
|
379
406
|
}
|
package/types.ts
CHANGED
|
@@ -107,4 +107,16 @@ export const DEFAULT_CONFIG: DocInjectorConfig = {
|
|
|
107
107
|
export const DEFAULT_MATCHER_OPTIONS: MatcherOptions = {
|
|
108
108
|
matchThreshold: DEFAULT_CONFIG.matchThreshold,
|
|
109
109
|
caseSensitive: false,
|
|
110
|
-
};
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Sentinel value used in CacheEntry.mtimeMs to mark entries written by the
|
|
114
|
+
* LLM keyword generator. -1 is chosen because `fs.Stats.mtimeMs` is the file's
|
|
115
|
+
* modification time in milliseconds since the Unix Epoch, so a real file's
|
|
116
|
+
* mtimeMs is not expected to equal -1 in practice (it would have to be dated
|
|
117
|
+
* 1 ms before 1970). Heuristic-written entries use the real file mtime.
|
|
118
|
+
*
|
|
119
|
+
* If you find yourself writing LLM_CACHE_SENTINEL into a real cache entry
|
|
120
|
+
* from a non-LLM code path, that's a bug.
|
|
121
|
+
*/
|
|
122
|
+
export const LLM_CACHE_SENTINEL = -1;
|