pi-doc-injector 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cache.ts +79 -0
- package/commands.ts +68 -1
- package/config.ts +63 -28
- package/docs/async-subagent-bug.md +199 -0
- package/globber.ts +48 -0
- package/index.ts +151 -20
- package/injector.ts +18 -1
- package/keyword-gen.ts +142 -0
- package/keyword-llm.ts +57 -0
- package/matcher.ts +14 -10
- package/package.json +5 -1
- package/picomatch.d.ts +11 -0
- package/registry.ts +361 -72
- package/types.ts +62 -3
package/index.ts
CHANGED
|
@@ -53,18 +53,21 @@
|
|
|
53
53
|
* is cleared after injection, and `markInjected()` operates on the registry's
|
|
54
54
|
* current entries, not the stale array.
|
|
55
55
|
*/
|
|
56
|
-
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
56
|
+
import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
57
|
+
import { Type } from "@sinclair/typebox";
|
|
57
58
|
import { resolve } from "node:path";
|
|
59
|
+
import { loadCache, saveCache } from "./cache";
|
|
58
60
|
import { loadConfig } from "./config";
|
|
59
61
|
import { buildSystemPromptAppend, notifyInjection } from "./injector";
|
|
62
|
+
import { buildKeywordGenPrompt } from "./keyword-llm";
|
|
60
63
|
import { extractText, KeywordMatcher } from "./matcher";
|
|
61
64
|
import { DocRegistry } from "./registry";
|
|
62
|
-
import { DEFAULT_MATCHER_OPTIONS, type DocEntry, type MatchResult } from "./types";
|
|
65
|
+
import { DEFAULT_MATCHER_OPTIONS, type DocEntry, type MatchResult, type KeywordCache, type CacheEntry } from "./types";
|
|
63
66
|
import { registerCommands } from "./commands";
|
|
64
67
|
|
|
65
68
|
export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
66
69
|
// ---- State ----
|
|
67
|
-
let config = loadConfig(process.cwd());
|
|
70
|
+
let config = await loadConfig(process.cwd());
|
|
68
71
|
let registry: DocRegistry | null = null;
|
|
69
72
|
let initRegistryPromise: Promise<void> | null = null;
|
|
70
73
|
let enabled = true;
|
|
@@ -72,23 +75,51 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
72
75
|
let pendingMatches = new Map<string, string[]>(); // filePath → matchedKeywords
|
|
73
76
|
let abortingForInjection = false; // guard against cascading aborts
|
|
74
77
|
|
|
78
|
+
// P5.4b — Guard flags for LLM keyword generation
|
|
79
|
+
let keywordGenInFlight = false;
|
|
80
|
+
let llmBatchesCompleted = 0;
|
|
81
|
+
let llmTotalFiles = 0;
|
|
82
|
+
let cache: KeywordCache = { version: 1, files: {} };
|
|
83
|
+
|
|
75
84
|
// ---- Helpers ----
|
|
76
85
|
const getRegistry = () => registry;
|
|
77
86
|
const getEnabled = () => enabled;
|
|
78
87
|
const setEnabled = (v: boolean) => {
|
|
79
88
|
enabled = v;
|
|
80
89
|
};
|
|
90
|
+
const getConfig = () => config;
|
|
91
|
+
|
|
92
|
+
/**
 * Persist scan results without discarding cache entries that reached disk
 * while the scan was running (e.g. entries written by the keywords tool).
 *
 * @param cwd - Directory handed to loadCache/saveCache.
 * @param dirtyEntries - Entries produced by the current scan; on a key
 *   collision they replace the on-disk entry.
 */
const safeSaveCache = async (cwd: string, dirtyEntries: Record<string, CacheEntry>) => {
  // Re-read the cache so the merge base is whatever is on disk right now.
  const onDisk = await loadCache(cwd);

  // Disk entries first, scan entries second: a later pair with the same key
  // overwrites the earlier one, so scan results take precedence.
  const merged: KeywordCache = {
    version: 1,
    files: Object.fromEntries([
      ...Object.entries(onDisk.files),
      ...Object.entries(dirtyEntries),
    ]),
  };

  await saveCache(cwd, merged);
};
|
|
81
110
|
|
|
82
111
|
/**
 * Load configuration and the keyword cache for `cwd`, build the document
 * registry from them, and persist whatever entries the registry reports via
 * getDirtyCache().
 *
 * @param cwd - Working directory used to locate config, cache and docs.
 */
const initRegistry = async (cwd: string) => {
  config = await loadConfig(cwd);
  const docsPath = resolve(cwd, config.docsPath);

  cache = await loadCache(cwd);
  registry = await DocRegistry.create(docsPath, config, cache);

  // Only touch the cache file when the build actually produced new entries.
  const dirty = registry.getDirtyCache();
  if (Object.keys(dirty).length > 0) {
    await safeSaveCache(cwd, dirty);
  }
};
|
|
93
124
|
|
|
94
125
|
const buildMatcher = (): KeywordMatcher | null => {
|
|
@@ -99,11 +130,69 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
99
130
|
);
|
|
100
131
|
};
|
|
101
132
|
|
|
133
|
+
// P5.4f — generateKeywordsLLM: sends a follow-up user message containing the
// prompt built by buildKeywordGenPrompt and marks keyword generation as in
// flight. The LLM is expected to respond by calling the
// _doc_injector_keywords tool.
//
// While keywordGenInFlight is set, the input / message_update /
// before_agent_start handlers return early, so the flag must only be raised
// when a message is really sent.
const generateKeywordsLLM = async (
  files: Array<{ path: string; snippet: string; existingKeywords: string[] }>,
) => {
  const prompt = buildKeywordGenPrompt(files);

  // buildKeywordGenPrompt returns "" for an empty batch: nothing to ask, so
  // do not send an empty message or suspend matching.
  if (prompt.length === 0) return;

  keywordGenInFlight = true;
  try {
    pi.sendUserMessage(prompt, { deliverAs: "followUp" });
  } catch (err) {
    // The message never went out, so no agent run will clear the flag for us.
    keywordGenInFlight = false;
    throw err;
  }
};
|
|
143
|
+
|
|
144
|
+
// P5.4a — Inline tool registration (BLOCKER-2 fix).
// Registered inside the factory for closure access to cache, config,
// saveCache and the llm* counters. Uses the real mtime from stat().
//
// The tool arguments come from the model, so every path is treated as
// untrusted: it must resolve to a regular file strictly inside the docs root,
// otherwise the item is skipped.
pi.registerTool({
  name: "_doc_injector_keywords",
  label: "Doc Injector Keywords",
  description:
    "Save LLM-generated keywords for documentation files. Call this tool with the keywords array after analyzing file snippets.",
  parameters: Type.Object({
    keywords: Type.Array(
      Type.Object({
        path: Type.String(),
        keywords: Type.Array(Type.String()),
      }),
    ),
  }),
  execute: async (_id, params, _signal, _onUpdate, ctx) => {
    const generated = params.keywords as Array<{ path: string; keywords: string[] }>;
    const { stat } = await import("node:fs/promises");
    const { isAbsolute, relative, sep } = await import("node:path");

    const docsRoot = resolve(ctx.cwd, config.docsPath);
    let saved = 0;

    for (const item of generated) {
      const absPath = resolve(docsRoot, item.path);

      // Reject the root itself, anything reached through "..", and paths on
      // another drive (relative() returns an absolute path in that case).
      const rel = relative(docsRoot, absPath);
      const outsideRoot =
        rel === "" || rel === ".." || rel.startsWith(`..${sep}`) || isAbsolute(rel);
      if (outsideRoot) {
        continue;
      }

      const fileStat = await stat(absPath).catch(() => null);
      if (!fileStat || !fileStat.isFile()) {
        continue;
      }

      // Normalise before capping so blanks and duplicates do not use up the
      // 20-keyword budget.
      const normalized = item.keywords
        .map((k) => k.trim().toLowerCase())
        .filter((k) => k.length > 0);

      cache.files[item.path] = {
        mtimeMs: fileStat.mtimeMs,
        keywords: [...new Set(normalized)].slice(0, 20),
      };
      saved++;
    }

    await saveCache(ctx.cwd, cache);
    llmBatchesCompleted++;
    llmTotalFiles += saved;
    return {
      content: [{ type: "text" as const, text: `Keywords saved for ${saved} files.` }],
      details: undefined as never,
    };
  },
});
|
|
185
|
+
|
|
102
186
|
// ---- Event: session_start ----
|
|
103
187
|
// Pi emits session_start for startup, reload, and real session transitions.
|
|
104
188
|
// Skip the reload variant because resources_discover will rebuild docs right
|
|
105
189
|
// after it, and deduplicate any overlapping non-reload inits.
|
|
106
190
|
pi.on("session_start", async (event, ctx) => {
|
|
191
|
+
// P5.4d — Safety unbind: clear all LLM keyword gen state on session start
|
|
192
|
+
keywordGenInFlight = false;
|
|
193
|
+
llmBatchesCompleted = 0;
|
|
194
|
+
llmTotalFiles = 0;
|
|
195
|
+
|
|
107
196
|
if (event.reason === "reload") return;
|
|
108
197
|
|
|
109
198
|
if (initRegistryPromise) {
|
|
@@ -119,17 +208,29 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
119
208
|
}
|
|
120
209
|
});
|
|
121
210
|
|
|
122
|
-
const reloadRegistry = async (): Promise<number> => {
|
|
211
|
+
/**
 * Rebuild the existing registry against the cache currently on disk.
 *
 * @param cwd - Directory whose cache should be used; defaults to process.cwd().
 * @returns Number of entries in the registry after the rebuild.
 * @throws Error when no registry has been created yet.
 */
const reloadRegistry = async (cwd?: string): Promise<number> => {
  if (!registry) throw new Error("No registry loaded");
  const effectiveCwd = cwd ?? process.cwd();

  // Swap in the on-disk cache first so LLM-generated entries are visible to
  // the rebuild.
  cache = await loadCache(effectiveCwd);
  registry.updateCache(cache);

  await registry.rebuild();

  // Persist only when the rebuild produced new or changed entries.
  const dirty = registry.getDirtyCache();
  const hasDirtyEntries = Object.keys(dirty).length > 0;
  if (hasDirtyEntries) {
    await safeSaveCache(effectiveCwd, dirty);
  }

  return registry.getEntries().length;
};
|
|
129
230
|
|
|
130
231
|
// ---- Event: resources_discover (reload) ----
|
|
131
|
-
pi.on("resources_discover", async (_event,
|
|
132
|
-
await reloadRegistry();
|
|
232
|
+
pi.on("resources_discover", async (_event, ctx) => {
|
|
233
|
+
await reloadRegistry(ctx.cwd);
|
|
133
234
|
});
|
|
134
235
|
|
|
135
236
|
// ---- Event: input (user message matching) ----
|
|
@@ -138,6 +239,17 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
138
239
|
// BEFORE before_agent_start fires, so docs are injected in time for
|
|
139
240
|
// the assistant's immediate response.
|
|
140
241
|
pi.on("input", async (event, _ctx) => {
|
|
242
|
+
// P5.4d — Safety unbind: if the user is typing interactively, clear all
|
|
243
|
+
// LLM keyword gen state (they may have aborted the generation).
|
|
244
|
+
if (event.source === "interactive") {
|
|
245
|
+
keywordGenInFlight = false;
|
|
246
|
+
llmBatchesCompleted = 0;
|
|
247
|
+
llmTotalFiles = 0;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// P5.4b — Guard: skip keyword matching during LLM keyword generation
|
|
251
|
+
if (keywordGenInFlight) return;
|
|
252
|
+
|
|
141
253
|
if (!enabled || !registry) return;
|
|
142
254
|
if (!event.text) return;
|
|
143
255
|
|
|
@@ -155,6 +267,9 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
155
267
|
// non-injected docs, abort the current generation and restart with the
|
|
156
268
|
// injected context — no waiting for the next turn.
|
|
157
269
|
pi.on("message_update", async (event, ctx) => {
|
|
270
|
+
// P5.4b — Guard: skip auto-abort logic during LLM keyword generation
|
|
271
|
+
if (keywordGenInFlight) return;
|
|
272
|
+
|
|
158
273
|
if (!enabled || !registry) return;
|
|
159
274
|
|
|
160
275
|
const msg = event.message;
|
|
@@ -195,6 +310,9 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
195
310
|
|
|
196
311
|
// ---- Event: before_agent_start (inject into system prompt) ----
|
|
197
312
|
pi.on("before_agent_start", async (event, ctx) => {
|
|
313
|
+
// P5.4b — Guard: skip injection during LLM keyword generation
|
|
314
|
+
if (keywordGenInFlight) return;
|
|
315
|
+
|
|
198
316
|
if (!enabled || !registry || pendingMatches.size === 0) return;
|
|
199
317
|
|
|
200
318
|
const matchedEntries: DocEntry[] = [];
|
|
@@ -213,7 +331,6 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
213
331
|
// past the model's limit.
|
|
214
332
|
const usage = ctx.getContextUsage();
|
|
215
333
|
if (usage && usage.tokens && usage.tokens > 0 && usage.percent && usage.percent > config.contextThreshold) {
|
|
216
|
-
console.warn(`[doc-injector] Skipping injection: context usage > ${config.contextThreshold}%`);
|
|
217
334
|
pendingMatches.clear();
|
|
218
335
|
return;
|
|
219
336
|
}
|
|
@@ -235,13 +352,25 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
235
352
|
};
|
|
236
353
|
});
|
|
237
354
|
|
|
238
|
-
// ---- Event: agent_end (restart after auto-abort) ----
|
|
239
|
-
pi.on("agent_end", async () => {
|
|
355
|
+
// ---- Event: agent_end (restart after auto-abort + LLM batch summary) ----
pi.on("agent_end", async (_event, ctx) => {
  // P5.4c — The agent run is over, so keyword generation can no longer be in
  // flight; report what the tool saved during this run, then reset the tally.
  keywordGenInFlight = false;

  const hadKeywordBatches = llmBatchesCompleted > 0;
  if (hadKeywordBatches) {
    const summary = `Doc keywords: ${llmTotalFiles} files across ${llmBatchesCompleted} batch(es)`;
    await ctx.ui.notify(summary, "info");
    llmBatchesCompleted = 0;
    llmTotalFiles = 0;
  }

  if (!abortingForInjection) return;

  abortingForInjection = false;
  // Defer sendUserMessage to the next tick to avoid re-entrancy issues.
  setTimeout(() => {
    pi.sendUserMessage("continue");
  }, 0);
});
|
|
247
376
|
|
|
@@ -251,5 +380,7 @@ export default async function docInjectorExtension(pi: ExtensionAPI) {
|
|
|
251
380
|
getEnabled,
|
|
252
381
|
setEnabled,
|
|
253
382
|
reloadRegistry,
|
|
383
|
+
getConfig,
|
|
384
|
+
generateKeywordsLLM,
|
|
254
385
|
});
|
|
255
386
|
}
|
package/injector.ts
CHANGED
|
@@ -13,6 +13,21 @@ export interface NotifyCapability {
|
|
|
13
13
|
notify: (msg: string, type?: "info" | "warning" | "error") => void;
|
|
14
14
|
}
|
|
15
15
|
|
|
16
|
+
/**
 * Sanitize keywords for safe injection into the system prompt.
 *
 * - Replaces every line-breaking character with a space to prevent prompt
 *   injection: LF, CR, vertical tab, form feed, NEL (U+0085) and the Unicode
 *   line/paragraph separators (U+2028, U+2029)
 * - Drops keywords that are empty after trimming
 * - Caps each keyword at 100 UTF-16 code units, never cutting a surrogate
 *   pair in half and never leaving trailing whitespace behind
 * - Enforces a hard limit of 20 keywords
 *
 * @param keywords - Raw keywords as matched for a document
 * @returns A new array of at most 20 single-line keywords
 */
function sanitizeKeywords(keywords: string[]): string[] {
  const lineBreaks = /[\n\r\v\f\u0085\u2028\u2029]/g;
  const maxLength = 100;
  const maxCount = 20;

  const capLength = (keyword: string): string => {
    if (keyword.length <= maxLength) return keyword;
    // If the last kept unit is a high surrogate its partner would be cut off;
    // drop it too rather than emit a lone surrogate.
    const lastUnit = keyword.charCodeAt(maxLength - 1);
    const splitsPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return keyword.slice(0, splitsPair ? maxLength - 1 : maxLength).trimEnd();
  };

  return keywords
    .map((k) => k.replace(lineBreaks, " ").trim())
    .filter((k) => k.length > 0)
    .map(capLength)
    .slice(0, maxCount);
}
|
|
30
|
+
|
|
16
31
|
/**
|
|
17
32
|
* Build a system prompt append string from matched documents.
|
|
18
33
|
*/
|
|
@@ -29,7 +44,9 @@ export function buildSystemPromptAppend(
|
|
|
29
44
|
];
|
|
30
45
|
|
|
31
46
|
for (const entry of entries) {
|
|
32
|
-
|
|
47
|
+
// Sanitize keywords before display to prevent prompt injection
|
|
48
|
+
const rawKeywords = matchedKeywords.get(entry.filePath) ?? [];
|
|
49
|
+
const keywords = sanitizeKeywords(rawKeywords);
|
|
33
50
|
sections.push(`### ${entry.title}`);
|
|
34
51
|
sections.push(`Source: \`${entry.relativePath}\``);
|
|
35
52
|
if (keywords.length > 0) {
|
package/keyword-gen.ts
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local keyword generation — extracts keywords from filenames and content
|
|
3
|
+
* when no frontmatter is available.
|
|
4
|
+
*
|
|
5
|
+
* Extraction sources:
|
|
6
|
+
* 1. Filename parts (split on -, _, .)
|
|
7
|
+
* 2. Markdown headings (# Title, ## Title, etc.)
|
|
8
|
+
* 3. Code symbols (function, class, const, interface, type, enum)
|
|
9
|
+
*
|
|
10
|
+
* All keywords are lowercased, deduplicated, and filtered through a stop-word list.
|
|
11
|
+
* Output is capped at 20 keywords.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Words that never become keywords. Each group is a space-separated list;
// the groups are split and merged into one Set.
const STOP_WORDS = new Set<string>(
  [
    // Articles
    "a an the",
    // Pronouns
    "i you he she it we they me him her us them my your his its our their this that these those who whom whose which what",
    // Prepositions
    "in on at by for with about to from of into onto upon over under between among through during before after above below up down out off",
    // Conjunctions
    "and but or nor so yet if then than as when while because since although though",
    // Auxiliary/modal verbs
    "is are was were be been being have has had having do does did doing will would shall should can could may might must",
    // Common adverbs
    "not no yes just only also too very now here there where how why all each every both few more most some any other another such much many little less",
    // Common content-less words
    "get set put use make see need one two three first second third using used new note notes example examples todo",
  ].flatMap((group) => group.split(" ")),
);
|
|
52
|
+
|
|
53
|
+
/**
 * Generate up to 20 keywords from a file's name and content.
 *
 * Candidates are gathered from three sources, in this order:
 * 1. Filename parts — split on `-`, `_`, and `.`, keep segments ≥ 3 chars
 * 2. Markdown headings — text after `#` markers
 * 3. Code symbols — function/class/const/interface/type/enum declarations
 *
 * All three sources are always collected in full; the combined list is then
 * deduplicated case-insensitively (first occurrence wins) and truncated to
 * the first 20 entries, so earlier sources take priority when the cap is hit.
 *
 * @param fileName - The basename of the file (e.g. "api-authentication.md")
 * @param content - The full file content
 * @returns Up to 20 deduplicated keyword strings
 */
export function generateKeywords(
  fileName: string,
  content: string,
): string[] {
  const candidates: string[] = [];
  addFromFilename(fileName, candidates);
  addFromHeadings(content, candidates);
  addFromCodeSymbols(content, candidates);

  // Keep the first spelling of each keyword, comparing in lower case.
  const seenLower = new Set<string>();
  const unique = candidates.filter((candidate) => {
    const key = candidate.toLowerCase();
    if (seenLower.has(key)) return false;
    seenLower.add(key);
    return true;
  });

  return unique.slice(0, 20);
}
|
|
95
|
+
|
|
96
|
+
/**
 * Extract keyword candidates from filename parts.
 *
 * The final extension is removed, the rest is split on `-`, `_`, `.` and
 * whitespace, and each part is reduced to its letters and digits (any
 * script, not only ASCII) and lowercased. Parts shorter than 3 characters or
 * listed in STOP_WORDS are dropped.
 *
 * @param fileName - Basename of the file (e.g. "api-authentication.md")
 * @param out - Array the accepted candidates are appended to
 */
function addFromFilename(fileName: string, out: string[]): void {
  // Strip only the last extension. `(?!^)` refuses a dot at position 0, so a
  // dotfile such as ".env" keeps its name instead of collapsing to "".
  const nameWithoutExt = fileName.replace(/(?!^)\.[^.]+$/, "");

  // Remaining dots (e.g. "api.test") are handled by the delimiter split.
  const parts = nameWithoutExt.split(/[-_.\s]+/);

  for (const part of parts) {
    // Keep letters, combining marks and digits of any script.
    const cleaned = part.replace(/[^\p{L}\p{M}\p{N}]/gu, "").toLowerCase();
    if (cleaned.length >= 3 && !STOP_WORDS.has(cleaned)) {
      out.push(cleaned);
    }
  }
}
|
|
111
|
+
|
|
112
|
+
/**
 * Extract keyword candidates from markdown headings (#, ##, ###, etc.).
 *
 * A heading is a line starting with 1–6 `#` characters followed by at least
 * one space or tab and some text on the same line. Each word of the heading
 * is reduced to its letters and digits (any script), lowercased, and kept
 * when it has ≥ 3 characters and is not in STOP_WORDS.
 *
 * @param content - Full file content
 * @param out - Array the accepted candidates are appended to
 */
function addFromHeadings(content: string, out: string[]): void {
  // `[ \t]+` rather than `\s+`: `\s` also matches a newline, which would let
  // an empty heading line ("#") capture the following line as heading text.
  const headingRegex = /^#{1,6}[ \t]+(.+)$/gm;
  let match: RegExpExecArray | null;
  while ((match = headingRegex.exec(content)) !== null) {
    const headingText = match[1].trim();
    for (const word of headingText.split(/\s+/)) {
      // Keep letters, combining marks and digits of any script.
      const cleaned = word.replace(/[^\p{L}\p{M}\p{N}]/gu, "").toLowerCase();
      if (cleaned.length >= 3 && !STOP_WORDS.has(cleaned)) {
        out.push(cleaned);
      }
    }
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Extract keyword candidates from code symbol declarations.
 *
 * Matches `function`, `class`, `const`, `interface`, `type` and `enum`
 * followed by an identifier, optionally preceded by `export` and/or `async`.
 * The identifier is lowercased and kept when it has ≥ 3 characters and is
 * not in STOP_WORDS.
 *
 * @param content - Full file content
 * @param out - Array the accepted candidates are appended to
 */
function addFromCodeSymbols(content: string, out: string[]): void {
  // The leading `\b` is essential: without it the keyword alternatives match
  // inside longer words ("prototype chain" → `type chain`, "subclass x" →
  // `class x`) and ordinary prose leaks in as symbols.
  const symbolRegex =
    /\b(?:export\s+)?(?:async\s+)?(?:function|class|const|interface|type|enum)\s+(\w+)/g;
  let match: RegExpExecArray | null;
  while ((match = symbolRegex.exec(content)) !== null) {
    const cleaned = match[1].toLowerCase();
    if (cleaned.length >= 3 && !STOP_WORDS.has(cleaned)) {
      out.push(cleaned);
    }
  }
}
|
package/keyword-llm.ts
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Keyword Generation — builds prompts for the LLM to generate keywords
|
|
3
|
+
* for documentation files via the _doc_injector_keywords tool.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/** Input for a single file in a keyword generation batch. */
|
|
7
|
+
export interface FileInput {
|
|
8
|
+
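/** Path relative to the docs root: the keywords tool resolves the echoed path with resolve(cwd, config.docsPath, path), so it equals a cwd-relative path only when docsPath is "." (e.g. "api.md") */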
|
|
9
|
+
path: string;
|
|
10
|
+
/** First ~500 chars of the file content as context */
|
|
11
|
+
snippet: string;
|
|
12
|
+
/** Existing keywords (from frontmatter/heuristic), so LLM augments not replaces */
|
|
13
|
+
existingKeywords: string[];
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
 * Build a user message prompt instructing the LLM to generate keywords
 * for a batch of documentation files by calling the _doc_injector_keywords tool.
 *
 * The prompt asks the LLM to read each file's snippet and produce 3-10 concise,
 * searchable keywords per file, incorporating any existing keywords.
 *
 * File-derived text is neutralised before it is embedded:
 * - paths and existing keywords are collapsed onto a single line;
 * - in the file list, markdown control characters in the path are escaped;
 * - in the expected-output example, the path is shown JSON-quoted and
 *   otherwise unmodified, because that is the exact string the LLM must pass
 *   back to the tool;
 * - triple backticks in the snippet are replaced so they cannot close the
 *   surrounding code fence.
 *
 * @param files - Batch of files to describe
 * @returns The prompt text, or "" when `files` is empty
 */
export function buildKeywordGenPrompt(files: FileInput[]): string {
  if (files.length === 0) return "";

  // Collapse line breaks so embedded text cannot start a new prompt line.
  const oneLine = (text: string): string =>
    text.replace(/[\r\n\u2028\u2029]+/g, " ").trim();
  const cleanKeywords = (keywords: string[]): string[] =>
    keywords.map(oneLine).filter((k) => k.length > 0);

  const fileDescriptions = files
    .map((f, i) => {
      // Escape markdown special chars in path to prevent prompt injection
      const safePath = oneLine(f.path).replace(/[*`\[\]]/g, "\\$&");
      // Replace code-fence markers in the snippet so it cannot break out
      const safeSnippet = f.snippet.replace(/```/g, "'''");
      const existing = cleanKeywords(f.existingKeywords);

      const lines = [`${i + 1}. **${safePath}**`];
      // Only emit the line when there is something to show (no blank filler).
      if (existing.length > 0) {
        lines.push(` Existing keywords: ${existing.join(", ")}`);
      }
      lines.push(" Snippet:", "```", safeSnippet, "```");
      return lines.join("\n");
    })
    .join("\n\n");

  const expectedOutput = files
    .map((f) => {
      const existing = cleanKeywords(f.existingKeywords);
      const preview = existing
        .slice(0, 5)
        .map((k) => JSON.stringify(k))
        .join(", ");
      const ellipsis = existing.length > 5 ? ", ..." : "";
      // JSON.stringify keeps the path verbatim (no markdown backslashes) so
      // the value echoed to the tool still names the real file.
      return ` - ${JSON.stringify(f.path)}: keywords array incorporating relevant existing keywords [${preview}${ellipsis}]`;
    })
    .join("\n");

  return `Generate documentation keywords for the following ${files.length} file(s). For each file, read the snippet and produce 3-10 concise, searchable keywords that someone might type when looking for this documentation.

Rules:
- Keywords should be lowercase, 3+ characters, no stop-words
- Incorporate any existing keywords that are still relevant
- Focus on the document's core topic, not generic terms
- Prefer specific technical terms over vague ones

Files:
${fileDescriptions}

After analysis, call the \`_doc_injector_keywords\` tool with a \`keywords\` array like:
${expectedOutput}

Do not output any other text — just call the tool with the keywords.`;
}
|
package/matcher.ts
CHANGED
|
@@ -31,6 +31,10 @@ export function extractText(content: unknown): string {
|
|
|
31
31
|
export class KeywordMatcher {
|
|
32
32
|
private options: MatcherOptions;
|
|
33
33
|
|
|
34
|
+
/**
|
|
35
|
+
* @param entries - The document entries to match against
|
|
36
|
+
* @param options - Optional matcher settings (merged with defaults)
|
|
37
|
+
*/
|
|
34
38
|
constructor(private entries: DocEntry[], options?: Partial<MatcherOptions>) {
|
|
35
39
|
this.options = { ...DEFAULT_MATCHER_OPTIONS, ...options };
|
|
36
40
|
}
|
|
@@ -44,8 +48,13 @@ export class KeywordMatcher {
|
|
|
44
48
|
for (const entry of this.entries) {
|
|
45
49
|
if (entry.injected) continue;
|
|
46
50
|
|
|
51
|
+
// Skip entries with no keywords (empty array or falsy)
|
|
52
|
+
if (!entry.keywords || entry.keywords.length === 0) continue;
|
|
53
|
+
|
|
47
54
|
const matchedKeywords: string[] = [];
|
|
48
55
|
for (const keyword of entry.keywords) {
|
|
56
|
+
// Skip empty keywords — an empty string is a substring of every text, so it would match everything
|
|
57
|
+
if (!keyword || keyword.trim().length === 0) continue;
|
|
49
58
|
if (this.keywordMatches(text, keyword)) {
|
|
50
59
|
matchedKeywords.push(keyword);
|
|
51
60
|
}
|
|
@@ -63,18 +72,13 @@ export class KeywordMatcher {
|
|
|
63
72
|
return results;
|
|
64
73
|
}
|
|
65
74
|
|
|
75
|
+
/**
|
|
76
|
+
* Check if a single keyword matches the given text.
|
|
77
|
+
* Uses simple substring inclusion (case-insensitive by default).
|
|
78
|
+
*/
|
|
66
79
|
private keywordMatches(text: string, keyword: string): boolean {
|
|
67
80
|
const search = this.options.caseSensitive ? text : text.toLowerCase();
|
|
68
81
|
const kw = this.options.caseSensitive ? keyword : keyword.toLowerCase();
|
|
69
|
-
|
|
70
|
-
if (this.options.wordBoundary) {
|
|
71
|
-
// Escape special regex chars in keyword, then apply word boundary
|
|
72
|
-
const escaped = kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
73
|
-
const flags = this.options.caseSensitive ? "" : "i";
|
|
74
|
-
const regex = new RegExp(`\\b${escaped}\\b`, flags);
|
|
75
|
-
return regex.test(search);
|
|
76
|
-
}
|
|
77
|
-
|
|
78
82
|
return search.includes(kw);
|
|
79
83
|
}
|
|
80
|
-
}
|
|
84
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-doc-injector",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Auto-inject relevant project documentation into Pi's LLM context based on keyword matching",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.ts",
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
},
|
|
10
10
|
"files": [
|
|
11
11
|
"*.ts",
|
|
12
|
+
"*.d.ts",
|
|
12
13
|
"docs/**/*.md",
|
|
13
14
|
"README.md"
|
|
14
15
|
],
|
|
@@ -33,6 +34,9 @@
|
|
|
33
34
|
"./index.ts"
|
|
34
35
|
]
|
|
35
36
|
},
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"picomatch": "^4.0.2"
|
|
39
|
+
},
|
|
36
40
|
"peerDependencies": {
|
|
37
41
|
"@mariozechner/pi-coding-agent": "*"
|
|
38
42
|
},
|
package/picomatch.d.ts
ADDED