diffdoc 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.diffdocrc.example +4 -1
- package/README.md +83 -10
- package/dist/commands/embed.js +110 -24
- package/dist/commands/query.js +18 -64
- package/dist/commands/summarize.js +264 -37
- package/dist/config.js +24 -0
- package/dist/index.js +19 -2
- package/dist/mcp.js +133 -0
- package/dist/services/retrieval.js +86 -0
- package/dist/types/artifacts.js +5 -0
- package/dist/utils/git.js +1 -5
- package/package.json +6 -3
package/.diffdocrc.example
CHANGED
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
## Project Description
|
|
4
4
|
|
|
5
|
-
DiffDoc turns source code into searchable, plain-English project context. It scans repository files, asks an OpenAI-compatible chat model to summarize the business behavior in each file, stores
|
|
5
|
+
DiffDoc turns source code into searchable, plain-English project context. It scans repository files, asks an OpenAI-compatible chat model to summarize the business behavior in each file, stores the summaries as portable per-hash JSON assets, embeds those assets into a local Vectra index, and answers questions using the indexed results as retrieval context.
|
|
6
6
|
|
|
7
7
|
The project is designed for teams that need fast codebase comprehension without requiring every stakeholder to read implementation details. It can run against local model servers such as Ollama, LM Studio, or vLLM, or against cloud OpenAI-compatible APIs.
|
|
8
8
|
|
|
@@ -37,7 +37,8 @@ Package scripts can call the installed binary:
|
|
|
37
37
|
"diffdoc:summarize": "diffdoc summarize",
|
|
38
38
|
"diffdoc:embed": "diffdoc embed",
|
|
39
39
|
"diffdoc:search": "diffdoc search",
|
|
40
|
-
"diffdoc:query": "diffdoc query"
|
|
40
|
+
"diffdoc:query": "diffdoc query",
|
|
41
|
+
"diffdoc:mcp": "diffdoc-mcp"
|
|
41
42
|
}
|
|
42
43
|
}
|
|
43
44
|
```
|
|
@@ -71,31 +72,42 @@ Example config with all supported keys:
|
|
|
71
72
|
"cloudLlmEndpoint": "https://api.openai.com/v1",
|
|
72
73
|
"cloudChatModel": "gpt-4o-mini",
|
|
73
74
|
"cloudEmbedModel": "text-embedding-3-small",
|
|
74
|
-
"openaiApiKey": ""
|
|
75
|
+
"openaiApiKey": "",
|
|
76
|
+
"includeGlobs": [],
|
|
77
|
+
"excludeGlobs": [],
|
|
78
|
+
"ignoreFile": ".diffdocignore"
|
|
75
79
|
}
|
|
76
80
|
```
|
|
77
81
|
|
|
78
|
-
Supported environment fallbacks use the uppercase names for the same settings, including `AI_PROVIDER`, `DIFFDOC_BASE_DIR`, `LOCAL_LLM_ENDPOINT`, `LOCAL_EMBED_ENDPOINT`, `LOCAL_CHAT_MODEL`, `LOCAL_EMBED_MODEL`, `CLOUD_LLM_ENDPOINT`, `CLOUD_CHAT_MODEL`, `CLOUD_EMBED_MODEL`, and `
|
|
82
|
+
Supported environment fallbacks use the uppercase names for the same settings, including `AI_PROVIDER`, `DIFFDOC_BASE_DIR`, `LOCAL_LLM_ENDPOINT`, `LOCAL_EMBED_ENDPOINT`, `LOCAL_CHAT_MODEL`, `LOCAL_EMBED_MODEL`, `CLOUD_LLM_ENDPOINT`, `CLOUD_CHAT_MODEL`, `CLOUD_EMBED_MODEL`, `OPENAI_API_KEY`, `DIFFDOC_INCLUDE_GLOBS`, `DIFFDOC_EXCLUDE_GLOBS`, and `DIFFDOC_IGNORE_FILE`.
|
|
79
83
|
|
|
80
84
|
## Manifest-First Design
|
|
81
85
|
|
|
82
|
-
DiffDoc separates summarization from embedding. The `summarize` command writes
|
|
86
|
+
DiffDoc separates summarization from embedding. The `summarize` command writes file-to-hash mappings to `manifest.json` and stores each summary in an independent hash-addressed JSON file under `./.diffdoc/summaries/`.
|
|
83
87
|
|
|
84
88
|
The manifest is plain JSON and contains one entry per tracked file:
|
|
85
89
|
|
|
86
90
|
```json
|
|
87
91
|
{
|
|
92
|
+
"schemaVersion": 2,
|
|
88
93
|
"lastSyncedCommit": "string-hash",
|
|
89
94
|
"files": {
|
|
90
|
-
"src/example.ts":
|
|
91
|
-
"hash": "md5-string",
|
|
92
|
-
"summaryText": "Plain-English explanation text here.",
|
|
93
|
-
"rawCodeSnapshot": "Full code text here..."
|
|
94
|
-
}
|
|
95
|
+
"src/example.ts": "md5-string"
|
|
95
96
|
}
|
|
96
97
|
}
|
|
97
98
|
```
|
|
98
99
|
|
|
100
|
+
Example summary asset at `./.diffdoc/summaries/<hash>.json`:
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{
|
|
104
|
+
"schemaVersion": 1,
|
|
105
|
+
"content_hash": "md5-string",
|
|
106
|
+
"summary": "Plain-English explanation text here.",
|
|
107
|
+
"raw_code_snapshot": "Optional code text when --include-code-snapshot is enabled"
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
99
111
|
Because the summaries are stored independently, users do not have to embed immediately. They can review, archive, transform, or embed the manifest later using their preferred vectorization model and storage solution.
|
|
100
112
|
|
|
101
113
|
DiffDoc includes `diffdoc embed` as a built-in convenience path for creating a local Vectra index, but the manifest can also be consumed by other tools such as custom OpenAI-compatible embedding pipelines, hosted vector databases, local search systems, or internal documentation workflows.
|
|
@@ -114,12 +126,30 @@ Summarize only changed Git files using the existing manifest state:
|
|
|
114
126
|
diffdoc summarize --path . --mode delta
|
|
115
127
|
```
|
|
116
128
|
|
|
129
|
+
Store raw code snapshots in summary assets:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
diffdoc summarize --path . --mode all --include-code-snapshot
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Add include/exclude filters at runtime:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
diffdoc summarize --path . --mode all --include-glob "src/**/*.ts" --exclude-glob "**/*.test.ts"
|
|
139
|
+
```
|
|
140
|
+
|
|
117
141
|
Embed the manifest into a local Vectra index at `./.diffdoc/vectra`:
|
|
118
142
|
|
|
119
143
|
```bash
|
|
120
144
|
diffdoc embed
|
|
121
145
|
```
|
|
122
146
|
|
|
147
|
+
Force full index rebuild:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
diffdoc embed --rebuild
|
|
151
|
+
```
|
|
152
|
+
|
|
123
153
|
Search the local Vectra index and print raw matches:
|
|
124
154
|
|
|
125
155
|
```bash
|
|
@@ -180,10 +210,53 @@ diffdoc summarize --path . --mode delta
|
|
|
180
210
|
diffdoc embed
|
|
181
211
|
```
|
|
182
212
|
|
|
213
|
+
## GitHub Actions
|
|
214
|
+
|
|
215
|
+
This repository includes a workflow at `.github/workflows/diffdoc-summarize.yml` that runs on pushes to `main`. It installs the project, builds the CLI, runs delta summarization, and commits `.diffdoc/manifest.json` back to the branch when the manifest changes.
|
|
216
|
+
|
|
217
|
+
The workflow intentionally ignores `.diffdoc/manifest.json` and `.diffdoc/vectra/**` changes as triggers so the bot commit does not create a loop.
|
|
218
|
+
|
|
219
|
+
Configure the same values used by the CLI as GitHub Actions variables or secrets, such as `AI_PROVIDER`, `LOCAL_LLM_ENDPOINT`, `LOCAL_CHAT_MODEL`, `CLOUD_LLM_ENDPOINT`, `CLOUD_CHAT_MODEL`, and `OPENAI_API_KEY`. The workflow uses the environment-variable fallback path in DiffDoc, so no `.diffdocrc` file is required in CI.
|
|
220
|
+
|
|
221
|
+
## MCP Server
|
|
222
|
+
|
|
223
|
+
DiffDoc also ships a local MCP stdio server as `diffdoc-mcp`. This lets MCP-compatible agents search or answer questions against the local Vectra index directly.
|
|
224
|
+
|
|
225
|
+
Run it manually with the same config style as the CLI:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
diffdoc-mcp --config ./.diffdocrc
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Example MCP client configuration:
|
|
232
|
+
|
|
233
|
+
```json
|
|
234
|
+
{
|
|
235
|
+
"mcpServers": {
|
|
236
|
+
"diffdoc": {
|
|
237
|
+
"command": "npx",
|
|
238
|
+
"args": ["diffdoc-mcp", "--config", "./.diffdocrc"]
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
If DiffDoc is installed as a project dev dependency, the same `npx diffdoc-mcp` command will resolve the local package binary.
|
|
245
|
+
|
|
246
|
+
Available MCP tools:
|
|
247
|
+
|
|
248
|
+
- `diffdoc_search`: searches the local Vectra index and returns raw file matches, summaries, scores, hashes, and optional code snapshots.
|
|
249
|
+
- `diffdoc_answer`: retrieves relevant index context and asks the configured chat model to answer the question.
|
|
250
|
+
- `diffdoc_index_stats`: returns the Vectra index path, whether it exists, and the indexed item count.
|
|
251
|
+
|
|
252
|
+
Run `diffdoc summarize` and `diffdoc embed` before using the MCP server, otherwise the search and answer tools will not have a local index to query.
|
|
253
|
+
|
|
183
254
|
## Notes
|
|
184
255
|
|
|
185
256
|
- Node.js `>=22` is required because Vectra requires it.
|
|
186
257
|
- This repository ignores `.diffdoc/vectra` and `.diffdocrc`; add similar entries to your project's `.gitignore` if you do not want generated indexes or local config committed. The manifest at `.diffdoc/manifest.json` is not ignored by this repository.
|
|
258
|
+
- Summary assets are written to `.diffdoc/summaries/*.json`.
|
|
259
|
+
- Manifest schema is currently `schemaVersion: 2`; older manifest shapes are not auto-migrated.
|
|
187
260
|
- Commit `.diffdoc/manifest.json` when using delta workflows. Delta summarization reads the previous manifest state to decide which changed files need fresh summaries.
|
|
188
261
|
- `summarize` requires a configured chat model.
|
|
189
262
|
- `embed` requires a configured embedding model.
|
package/dist/commands/embed.js
CHANGED
|
@@ -8,56 +8,142 @@ exports.runEmbed = runEmbed;
|
|
|
8
8
|
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
9
9
|
const node_path_1 = __importDefault(require("node:path"));
|
|
10
10
|
const vectra_1 = require("vectra");
|
|
11
|
+
const artifacts_1 = require("../types/artifacts");
|
|
11
12
|
const llm_1 = require("../utils/llm");
|
|
12
13
|
const paths_1 = require("../utils/paths");
|
|
13
14
|
const VECTRA_INDEX_DIR = "vectra";
|
|
14
15
|
function getVectraIndexPath(config) {
|
|
15
16
|
return node_path_1.default.resolve((0, paths_1.getDiffdocBaseDir)(config.baseDir), VECTRA_INDEX_DIR);
|
|
16
17
|
}
|
|
18
|
+
function getSummaryDir(manifestPath) {
|
|
19
|
+
return node_path_1.default.resolve(node_path_1.default.dirname(manifestPath), "summaries");
|
|
20
|
+
}
|
|
21
|
+
function getSummaryPath(summaryDir, hash) {
|
|
22
|
+
return node_path_1.default.resolve(summaryDir, `${hash}.json`);
|
|
23
|
+
}
|
|
24
|
+
async function readManifest(manifestPath) {
|
|
25
|
+
const parsed = JSON.parse(await promises_1.default.readFile(manifestPath, "utf8"));
|
|
26
|
+
if (parsed.schemaVersion !== artifacts_1.MANIFEST_SCHEMA_VERSION) {
|
|
27
|
+
throw new Error(`Unsupported manifest schema in ${manifestPath}. Expected schemaVersion ${artifacts_1.MANIFEST_SCHEMA_VERSION}.`);
|
|
28
|
+
}
|
|
29
|
+
return {
|
|
30
|
+
schemaVersion: artifacts_1.MANIFEST_SCHEMA_VERSION,
|
|
31
|
+
lastSyncedCommit: typeof parsed.lastSyncedCommit === "string" ? parsed.lastSyncedCommit : "",
|
|
32
|
+
files: parsed.files && typeof parsed.files === "object" ? parsed.files : {}
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
async function readSummaryAsset(summaryPath) {
|
|
36
|
+
const parsed = JSON.parse(await promises_1.default.readFile(summaryPath, "utf8"));
|
|
37
|
+
if (parsed.schemaVersion !== artifacts_1.SUMMARY_ASSET_SCHEMA_VERSION) {
|
|
38
|
+
throw new Error(`Unsupported summary schema in ${summaryPath}. Expected schemaVersion ${artifacts_1.SUMMARY_ASSET_SCHEMA_VERSION}.`);
|
|
39
|
+
}
|
|
40
|
+
if (typeof parsed.content_hash !== "string") {
|
|
41
|
+
throw new Error(`Invalid summary hash in ${summaryPath}.`);
|
|
42
|
+
}
|
|
43
|
+
if (typeof parsed.summary !== "string") {
|
|
44
|
+
throw new Error(`Invalid summary text in ${summaryPath}.`);
|
|
45
|
+
}
|
|
46
|
+
return {
|
|
47
|
+
schemaVersion: artifacts_1.SUMMARY_ASSET_SCHEMA_VERSION,
|
|
48
|
+
content_hash: parsed.content_hash,
|
|
49
|
+
summary: parsed.summary,
|
|
50
|
+
raw_code_snapshot: typeof parsed.raw_code_snapshot === "string" ? parsed.raw_code_snapshot : undefined
|
|
51
|
+
};
|
|
52
|
+
}
|
|
17
53
|
function buildDocument(filePath, summaryText, rawCodeSnapshot) {
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
54
|
+
let output = `File: ${filePath}\nSummary: ${summaryText}`;
|
|
55
|
+
if (rawCodeSnapshot) {
|
|
56
|
+
output += `\n\nCode Snapshot:\n\`\`\`\n${rawCodeSnapshot}\n\`\`\``;
|
|
57
|
+
}
|
|
58
|
+
return output;
|
|
21
59
|
}
|
|
22
60
|
async function runEmbed(options, config) {
|
|
23
61
|
const manifestPath = (0, paths_1.resolveDiffdocArtifactPath)(options.manifest, config.baseDir);
|
|
24
|
-
const manifest =
|
|
62
|
+
const manifest = await readManifest(manifestPath);
|
|
25
63
|
const entries = Object.entries(manifest.files);
|
|
64
|
+
const summaryDir = getSummaryDir(manifestPath);
|
|
26
65
|
const indexPath = getVectraIndexPath(config);
|
|
27
66
|
const index = new vectra_1.LocalIndex(indexPath);
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
67
|
+
if (options.rebuild) {
|
|
68
|
+
await index.createIndex({
|
|
69
|
+
version: 1,
|
|
70
|
+
deleteIfExists: true,
|
|
71
|
+
metadata_config: {
|
|
72
|
+
indexed: ["filePath", "hash"]
|
|
73
|
+
}
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
else if (!await index.isIndexCreated()) {
|
|
77
|
+
await index.createIndex({
|
|
78
|
+
version: 1,
|
|
79
|
+
deleteIfExists: false,
|
|
80
|
+
metadata_config: {
|
|
81
|
+
indexed: ["filePath", "hash"]
|
|
82
|
+
}
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
const existingItems = await index.listItems();
|
|
86
|
+
const existingByPath = new Map(existingItems.map((item) => [item.id, item]));
|
|
87
|
+
const toUpsert = [];
|
|
88
|
+
for (const [filePath, hash] of entries) {
|
|
89
|
+
const existing = existingByPath.get(filePath);
|
|
90
|
+
if (existing?.metadata.hash === hash) {
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
const summaryPath = getSummaryPath(summaryDir, hash);
|
|
94
|
+
const summaryAsset = await readSummaryAsset(summaryPath);
|
|
95
|
+
if (summaryAsset.content_hash !== hash) {
|
|
96
|
+
throw new Error(`Hash mismatch in summary asset ${summaryPath}.`);
|
|
33
97
|
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
98
|
+
toUpsert.push({
|
|
99
|
+
filePath,
|
|
100
|
+
hash,
|
|
101
|
+
summaryText: summaryAsset.summary,
|
|
102
|
+
rawCodeSnapshot: summaryAsset.raw_code_snapshot,
|
|
103
|
+
document: buildDocument(filePath, summaryAsset.summary, summaryAsset.raw_code_snapshot)
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
const activePathSet = new Set(entries.map(([filePath]) => filePath));
|
|
107
|
+
const toDelete = existingItems
|
|
108
|
+
.map((item) => item.id)
|
|
109
|
+
.filter((id) => Boolean(id) && !activePathSet.has(id));
|
|
110
|
+
if (toUpsert.length === 0 && toDelete.length === 0) {
|
|
111
|
+
console.log(`Index is already up to date at ${indexPath}.`);
|
|
37
112
|
return;
|
|
38
113
|
}
|
|
39
|
-
const
|
|
40
|
-
|
|
114
|
+
const embeddings = toUpsert.length > 0
|
|
115
|
+
? await (0, llm_1.generateEmbeddings)(toUpsert.map((item) => item.document), config.embeddings)
|
|
116
|
+
: [];
|
|
41
117
|
await index.beginUpdate();
|
|
42
118
|
try {
|
|
43
|
-
for (let i = 0; i <
|
|
44
|
-
const
|
|
119
|
+
for (let i = 0; i < toUpsert.length; i += 1) {
|
|
120
|
+
const item = toUpsert[i];
|
|
121
|
+
const metadata = item.rawCodeSnapshot
|
|
122
|
+
? {
|
|
123
|
+
filePath: item.filePath,
|
|
124
|
+
hash: item.hash,
|
|
125
|
+
summaryText: item.summaryText,
|
|
126
|
+
rawCodeSnapshot: item.rawCodeSnapshot
|
|
127
|
+
}
|
|
128
|
+
: {
|
|
129
|
+
filePath: item.filePath,
|
|
130
|
+
hash: item.hash,
|
|
131
|
+
summaryText: item.summaryText
|
|
132
|
+
};
|
|
45
133
|
await index.upsertItem({
|
|
46
|
-
id: filePath,
|
|
134
|
+
id: item.filePath,
|
|
47
135
|
vector: embeddings[i],
|
|
48
|
-
metadata
|
|
49
|
-
filePath,
|
|
50
|
-
hash: file.hash,
|
|
51
|
-
summaryText: file.summaryText,
|
|
52
|
-
rawCodeSnapshot: file.rawCodeSnapshot
|
|
53
|
-
}
|
|
136
|
+
metadata
|
|
54
137
|
});
|
|
55
138
|
}
|
|
139
|
+
for (const itemId of toDelete) {
|
|
140
|
+
await index.deleteItem(itemId);
|
|
141
|
+
}
|
|
56
142
|
await index.endUpdate();
|
|
57
143
|
}
|
|
58
144
|
catch (error) {
|
|
59
145
|
index.cancelUpdate();
|
|
60
146
|
throw error;
|
|
61
147
|
}
|
|
62
|
-
console.log(`Embedded ${
|
|
148
|
+
console.log(`Embedded ${toUpsert.length} summaries and pruned ${toDelete.length} items in ${indexPath}.`);
|
|
63
149
|
}
|
package/dist/commands/query.js
CHANGED
|
@@ -2,95 +2,49 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.runQuery = runQuery;
|
|
4
4
|
exports.runSearch = runSearch;
|
|
5
|
-
const
|
|
6
|
-
const llm_1 = require("../utils/llm");
|
|
7
|
-
const embed_1 = require("./embed");
|
|
8
|
-
const CODE_QUERY_PREFIX = "Represent this query for searching relevant code: ";
|
|
9
|
-
function parseTopK(value) {
|
|
10
|
-
const topK = Number.parseInt(value, 10);
|
|
11
|
-
if (!Number.isInteger(topK) || topK < 1) {
|
|
12
|
-
throw new Error("Invalid --top value. Expected a positive integer.");
|
|
13
|
-
}
|
|
14
|
-
return topK;
|
|
15
|
-
}
|
|
16
|
-
function trimForDisplay(text, maxLength) {
|
|
17
|
-
if (text.length <= maxLength) {
|
|
18
|
-
return text;
|
|
19
|
-
}
|
|
20
|
-
return `${text.slice(0, maxLength).trimEnd()}...`;
|
|
21
|
-
}
|
|
22
|
-
function buildAnswerPrompt(question, results) {
|
|
23
|
-
const context = results.map((result, indexPosition) => {
|
|
24
|
-
const metadata = result.item.metadata;
|
|
25
|
-
return [
|
|
26
|
-
`Result ${indexPosition + 1}`,
|
|
27
|
-
`File: ${metadata.filePath}`,
|
|
28
|
-
`Score: ${result.score}`,
|
|
29
|
-
`Summary:\n${metadata.summaryText}`,
|
|
30
|
-
`Code Snapshot:\n${metadata.rawCodeSnapshot}`
|
|
31
|
-
].join("\n");
|
|
32
|
-
}).join("\n\n---\n\n");
|
|
33
|
-
return `Answer the user's question using only the retrieved DiffDoc results below. If the results do not contain enough information, say what is missing. Prefer a direct answer first, then cite the relevant file paths. Keep the explanation appropriate to the question: summarize when asked for a summary, explain implementation details when asked how something works, and avoid unsupported claims.\n\nUser question:\n${question}\n\nRetrieved results:\n${context}`;
|
|
34
|
-
}
|
|
5
|
+
const retrieval_1 = require("../services/retrieval");
|
|
35
6
|
async function runQuery(message, options, config) {
|
|
36
|
-
const topK = parseTopK(options.top);
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
-
if (
|
|
40
|
-
throw new Error(`No Vectra index found at ${indexPath}. Run "diffdoc embed" first.`);
|
|
41
|
-
}
|
|
42
|
-
const [queryVector] = await (0, llm_1.generateEmbeddings)([`${CODE_QUERY_PREFIX}${message}`], config.embeddings);
|
|
43
|
-
const results = await index.queryItems(queryVector, message, topK);
|
|
44
|
-
if (results.length === 0) {
|
|
45
|
-
console.log("No matching embedded summaries found.");
|
|
7
|
+
const topK = (0, retrieval_1.parseTopK)(options.top);
|
|
8
|
+
const answerResult = await (0, retrieval_1.answerFromIndex)(message, topK, config);
|
|
9
|
+
console.log(answerResult.answer);
|
|
10
|
+
if (answerResult.sources.length === 0) {
|
|
46
11
|
return;
|
|
47
12
|
}
|
|
48
|
-
const answer = await (0, llm_1.promptLlm)(buildAnswerPrompt(message, results), config.chat);
|
|
49
|
-
console.log(answer);
|
|
50
13
|
console.log("\nSources:");
|
|
51
|
-
for (const [indexPosition,
|
|
52
|
-
|
|
53
|
-
console.log(`${indexPosition + 1}. ${metadata.filePath} (${result.score.toFixed(4)})`);
|
|
14
|
+
for (const [indexPosition, source] of answerResult.sources.entries()) {
|
|
15
|
+
console.log(`${indexPosition + 1}. ${source.filePath} (${source.score.toFixed(4)})`);
|
|
54
16
|
}
|
|
55
17
|
if (!options.code) {
|
|
56
18
|
return;
|
|
57
19
|
}
|
|
58
|
-
for (const [indexPosition, result] of results.entries()) {
|
|
59
|
-
|
|
60
|
-
console.log(`\n#${indexPosition + 1} ${metadata.filePath}`);
|
|
20
|
+
for (const [indexPosition, result] of answerResult.results.entries()) {
|
|
21
|
+
console.log(`\n#${indexPosition + 1} ${result.filePath}`);
|
|
61
22
|
console.log(`Score: ${result.score.toFixed(4)}`);
|
|
62
|
-
console.log(`Hash: ${
|
|
23
|
+
console.log(`Hash: ${result.hash}`);
|
|
63
24
|
console.log("Summary:");
|
|
64
|
-
console.log(trimForDisplay(
|
|
25
|
+
console.log((0, retrieval_1.trimForDisplay)(result.summaryText, 1200));
|
|
65
26
|
if (options.code) {
|
|
66
27
|
console.log("Code Snapshot:");
|
|
67
|
-
console.log(trimForDisplay(
|
|
28
|
+
console.log((0, retrieval_1.trimForDisplay)(result.rawCodeSnapshot || "(not stored)", 2000));
|
|
68
29
|
}
|
|
69
30
|
}
|
|
70
31
|
}
|
|
71
32
|
async function runSearch(message, options, config) {
|
|
72
|
-
const topK = parseTopK(options.top);
|
|
73
|
-
const
|
|
74
|
-
const index = new vectra_1.LocalIndex(indexPath);
|
|
75
|
-
if (!await index.isIndexCreated()) {
|
|
76
|
-
throw new Error(`No Vectra index found at ${indexPath}. Run "diffdoc embed" first.`);
|
|
77
|
-
}
|
|
78
|
-
const [queryVector] = await (0, llm_1.generateEmbeddings)([`${CODE_QUERY_PREFIX}${message}`], config.embeddings);
|
|
79
|
-
const results = await index.queryItems(queryVector, message, topK);
|
|
33
|
+
const topK = (0, retrieval_1.parseTopK)(options.top);
|
|
34
|
+
const results = await (0, retrieval_1.searchIndex)(message, topK, config);
|
|
80
35
|
if (results.length === 0) {
|
|
81
36
|
console.log("No matching embedded summaries found.");
|
|
82
37
|
return;
|
|
83
38
|
}
|
|
84
39
|
for (const [indexPosition, result] of results.entries()) {
|
|
85
|
-
|
|
86
|
-
console.log(`\n#${indexPosition + 1} ${metadata.filePath}`);
|
|
40
|
+
console.log(`\n#${indexPosition + 1} ${result.filePath}`);
|
|
87
41
|
console.log(`Score: ${result.score.toFixed(4)}`);
|
|
88
|
-
console.log(`Hash: ${
|
|
42
|
+
console.log(`Hash: ${result.hash}`);
|
|
89
43
|
console.log("Summary:");
|
|
90
|
-
console.log(trimForDisplay(
|
|
44
|
+
console.log((0, retrieval_1.trimForDisplay)(result.summaryText, 1200));
|
|
91
45
|
if (options.code) {
|
|
92
46
|
console.log("Code Snapshot:");
|
|
93
|
-
console.log(trimForDisplay(
|
|
47
|
+
console.log((0, retrieval_1.trimForDisplay)(result.rawCodeSnapshot || "(not stored)", 2000));
|
|
94
48
|
}
|
|
95
49
|
}
|
|
96
50
|
}
|
|
@@ -6,60 +6,240 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
6
6
|
exports.runSummarize = runSummarize;
|
|
7
7
|
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
8
8
|
const node_path_1 = __importDefault(require("node:path"));
|
|
9
|
+
const artifacts_1 = require("../types/artifacts");
|
|
9
10
|
const git_1 = require("../utils/git");
|
|
10
11
|
const hashing_1 = require("../utils/hashing");
|
|
11
12
|
const llm_1 = require("../utils/llm");
|
|
12
13
|
const paths_1 = require("../utils/paths");
|
|
13
|
-
const TARGET_EXTENSIONS = new Set([".ts", ".js", ".cs", ".py"]);
|
|
14
|
-
const IGNORED_DIRECTORIES = new Set([".git", "node_modules", "dist"]);
|
|
15
|
-
const IGNORED_FILES = new Set(["package-lock.json", "yarn.lock", "pnpm-lock.yaml", "bun.lockb"]);
|
|
16
14
|
function normalizeRelativePath(filePath) {
|
|
17
15
|
return filePath.split(node_path_1.default.sep).join("/");
|
|
18
16
|
}
|
|
19
|
-
function
|
|
20
|
-
return
|
|
17
|
+
function getSummaryDir(manifestPath) {
|
|
18
|
+
return node_path_1.default.resolve(node_path_1.default.dirname(manifestPath), "summaries");
|
|
19
|
+
}
|
|
20
|
+
function getSummaryPath(summaryDir, hash) {
|
|
21
|
+
return node_path_1.default.resolve(summaryDir, `${hash}.json`);
|
|
22
|
+
}
|
|
23
|
+
function normalizeGlobPattern(pattern) {
|
|
24
|
+
return pattern.split(node_path_1.default.sep).join("/");
|
|
25
|
+
}
|
|
26
|
+
function escapeRegex(value) {
|
|
27
|
+
return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
|
|
28
|
+
}
|
|
29
|
+
function globToRegExp(pattern) {
|
|
30
|
+
const normalized = normalizeGlobPattern(pattern);
|
|
31
|
+
let regexBody = "";
|
|
32
|
+
for (let i = 0; i < normalized.length; i += 1) {
|
|
33
|
+
const char = normalized[i];
|
|
34
|
+
const next = normalized[i + 1];
|
|
35
|
+
if (char === "*" && next === "*") {
|
|
36
|
+
regexBody += ".*";
|
|
37
|
+
i += 1;
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
40
|
+
if (char === "*") {
|
|
41
|
+
regexBody += "[^/]*";
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
if (char === "?") {
|
|
45
|
+
regexBody += "[^/]";
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
regexBody += escapeRegex(char);
|
|
49
|
+
}
|
|
50
|
+
return new RegExp(`^${regexBody}$`);
|
|
51
|
+
}
|
|
52
|
+
function compileGlobs(patterns) {
|
|
53
|
+
return patterns.filter(Boolean).map(globToRegExp);
|
|
54
|
+
}
|
|
55
|
+
function matchesAny(filePath, patterns) {
|
|
56
|
+
return patterns.some((pattern) => pattern.test(filePath));
|
|
57
|
+
}
|
|
58
|
+
function shouldIncludeFile(filePath, includeGlobs, excludeGlobs, ignoreGlobs) {
|
|
59
|
+
if (includeGlobs.length > 0 && !matchesAny(filePath, includeGlobs)) {
|
|
60
|
+
return false;
|
|
61
|
+
}
|
|
62
|
+
if (excludeGlobs.length > 0 && matchesAny(filePath, excludeGlobs)) {
|
|
63
|
+
return false;
|
|
64
|
+
}
|
|
65
|
+
if (ignoreGlobs.length > 0 && matchesAny(filePath, ignoreGlobs)) {
|
|
66
|
+
return false;
|
|
67
|
+
}
|
|
68
|
+
return true;
|
|
69
|
+
}
|
|
70
|
+
async function fileExists(filePath) {
|
|
71
|
+
try {
|
|
72
|
+
await promises_1.default.access(filePath);
|
|
73
|
+
return true;
|
|
74
|
+
}
|
|
75
|
+
catch {
|
|
76
|
+
return false;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
async function atomicWriteUtf8(targetPath, content) {
|
|
80
|
+
await promises_1.default.mkdir(node_path_1.default.dirname(targetPath), { recursive: true });
|
|
81
|
+
const tempPath = `${targetPath}.${process.pid}.${Date.now()}.tmp`;
|
|
82
|
+
const handle = await promises_1.default.open(tempPath, "w");
|
|
83
|
+
try {
|
|
84
|
+
await handle.writeFile(content, "utf8");
|
|
85
|
+
await handle.sync();
|
|
86
|
+
}
|
|
87
|
+
finally {
|
|
88
|
+
await handle.close();
|
|
89
|
+
}
|
|
90
|
+
await promises_1.default.rename(tempPath, targetPath);
|
|
91
|
+
}
|
|
92
|
+
async function writeManifest(manifestPath, manifest) {
|
|
93
|
+
await atomicWriteUtf8(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
|
|
94
|
+
}
|
|
95
|
+
async function writeSummaryAsset(summaryPath, summary) {
|
|
96
|
+
await atomicWriteUtf8(summaryPath, `${JSON.stringify(summary, null, 2)}\n`);
|
|
21
97
|
}
|
|
22
98
|
async function readManifest(manifestPath) {
|
|
23
99
|
try {
|
|
24
|
-
|
|
100
|
+
const parsed = JSON.parse(await promises_1.default.readFile(manifestPath, "utf8"));
|
|
101
|
+
if (parsed.schemaVersion !== artifacts_1.MANIFEST_SCHEMA_VERSION) {
|
|
102
|
+
throw new Error(`Unsupported manifest schema in ${manifestPath}. Expected schemaVersion ${artifacts_1.MANIFEST_SCHEMA_VERSION}.`);
|
|
103
|
+
}
|
|
104
|
+
return {
|
|
105
|
+
schemaVersion: artifacts_1.MANIFEST_SCHEMA_VERSION,
|
|
106
|
+
lastSyncedCommit: typeof parsed.lastSyncedCommit === "string" ? parsed.lastSyncedCommit : "",
|
|
107
|
+
files: parsed.files && typeof parsed.files === "object" ? parsed.files : {}
|
|
108
|
+
};
|
|
25
109
|
}
|
|
26
110
|
catch (error) {
|
|
27
111
|
const nodeError = error;
|
|
28
112
|
if (nodeError.code === "ENOENT") {
|
|
29
|
-
return {
|
|
113
|
+
return {
|
|
114
|
+
schemaVersion: artifacts_1.MANIFEST_SCHEMA_VERSION,
|
|
115
|
+
lastSyncedCommit: "",
|
|
116
|
+
files: {}
|
|
117
|
+
};
|
|
30
118
|
}
|
|
31
119
|
throw error;
|
|
32
120
|
}
|
|
33
121
|
}
|
|
34
|
-
async function
|
|
35
|
-
|
|
36
|
-
|
|
122
|
+
async function readIgnorePatterns(repoPath, ignoreFilePath) {
|
|
123
|
+
const absolutePath = node_path_1.default.isAbsolute(ignoreFilePath)
|
|
124
|
+
? ignoreFilePath
|
|
125
|
+
: node_path_1.default.resolve(repoPath, ignoreFilePath);
|
|
126
|
+
try {
|
|
127
|
+
const raw = await promises_1.default.readFile(absolutePath, "utf8");
|
|
128
|
+
return raw
|
|
129
|
+
.split(/\r?\n/)
|
|
130
|
+
.map((line) => line.trim())
|
|
131
|
+
.filter((line) => line.length > 0 && !line.startsWith("#"))
|
|
132
|
+
.map(normalizeGlobPattern);
|
|
133
|
+
}
|
|
134
|
+
catch (error) {
|
|
135
|
+
const nodeError = error;
|
|
136
|
+
if (nodeError.code === "ENOENT") {
|
|
137
|
+
return [];
|
|
138
|
+
}
|
|
139
|
+
throw error;
|
|
140
|
+
}
|
|
37
141
|
}
|
|
38
|
-
async function walkCodeFiles(rootPath, currentPath = rootPath) {
|
|
142
|
+
async function walkCodeFiles(rootPath, includeGlobs, excludeGlobs, ignoreGlobs, currentPath = rootPath) {
|
|
39
143
|
const entries = await promises_1.default.readdir(currentPath, { withFileTypes: true });
|
|
40
144
|
const files = [];
|
|
41
145
|
for (const entry of entries) {
|
|
42
146
|
const entryPath = node_path_1.default.join(currentPath, entry.name);
|
|
43
147
|
if (entry.isDirectory()) {
|
|
44
|
-
|
|
45
|
-
files.push(...await walkCodeFiles(rootPath, entryPath));
|
|
46
|
-
}
|
|
148
|
+
files.push(...await walkCodeFiles(rootPath, includeGlobs, excludeGlobs, ignoreGlobs, entryPath));
|
|
47
149
|
continue;
|
|
48
150
|
}
|
|
49
|
-
if (entry.isFile()
|
|
50
|
-
|
|
151
|
+
if (entry.isFile()) {
|
|
152
|
+
const relativePath = normalizeRelativePath(node_path_1.default.relative(rootPath, entryPath));
|
|
153
|
+
if (shouldIncludeFile(relativePath, includeGlobs, excludeGlobs, ignoreGlobs)) {
|
|
154
|
+
files.push(relativePath);
|
|
155
|
+
}
|
|
51
156
|
}
|
|
52
157
|
}
|
|
53
158
|
return files.sort();
|
|
54
159
|
}
|
|
55
|
-
|
|
56
|
-
const
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
160
|
+
function countHashRefs(files) {
|
|
161
|
+
const refs = new Map();
|
|
162
|
+
for (const hash of Object.values(files)) {
|
|
163
|
+
refs.set(hash, (refs.get(hash) || 0) + 1);
|
|
164
|
+
}
|
|
165
|
+
return refs;
|
|
166
|
+
}
|
|
167
|
+
async function deleteSummaryIfUnreferenced(summaryDir, hash, refs) {
|
|
168
|
+
if ((refs.get(hash) || 0) > 0) {
|
|
169
|
+
return;
|
|
170
|
+
}
|
|
171
|
+
const summaryPath = getSummaryPath(summaryDir, hash);
|
|
172
|
+
try {
|
|
173
|
+
await promises_1.default.unlink(summaryPath);
|
|
174
|
+
}
|
|
175
|
+
catch (error) {
|
|
176
|
+
const nodeError = error;
|
|
177
|
+
if (nodeError.code !== "ENOENT") {
|
|
178
|
+
throw error;
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
async function setManifestPathHash(filePath, newHash, manifest, manifestPath, summaryDir, refs) {
|
|
183
|
+
const previousHash = manifest.files[filePath];
|
|
184
|
+
if (previousHash === newHash) {
|
|
185
|
+
return;
|
|
186
|
+
}
|
|
187
|
+
if (previousHash) {
|
|
188
|
+
refs.set(previousHash, Math.max((refs.get(previousHash) || 1) - 1, 0));
|
|
189
|
+
}
|
|
190
|
+
manifest.files[filePath] = newHash;
|
|
191
|
+
refs.set(newHash, (refs.get(newHash) || 0) + 1);
|
|
192
|
+
await writeManifest(manifestPath, manifest);
|
|
193
|
+
if (previousHash) {
|
|
194
|
+
await deleteSummaryIfUnreferenced(summaryDir, previousHash, refs);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
async function removeManifestPath(filePath, manifest, manifestPath, summaryDir, refs) {
|
|
198
|
+
const previousHash = manifest.files[filePath];
|
|
199
|
+
if (!previousHash) {
|
|
200
|
+
return;
|
|
201
|
+
}
|
|
202
|
+
delete manifest.files[filePath];
|
|
203
|
+
refs.set(previousHash, Math.max((refs.get(previousHash) || 1) - 1, 0));
|
|
204
|
+
await writeManifest(manifestPath, manifest);
|
|
205
|
+
await deleteSummaryIfUnreferenced(summaryDir, previousHash, refs);
|
|
206
|
+
}
|
|
207
|
+
async function ensureSummaryAsset(summaryDir, hash, summaryText, rawCodeSnapshot, includeCodeSnapshot) {
|
|
208
|
+
const summaryPath = getSummaryPath(summaryDir, hash);
|
|
209
|
+
if (await fileExists(summaryPath)) {
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
const summary = {
|
|
213
|
+
schemaVersion: artifacts_1.SUMMARY_ASSET_SCHEMA_VERSION,
|
|
214
|
+
content_hash: hash,
|
|
215
|
+
summary: summaryText,
|
|
216
|
+
raw_code_snapshot: includeCodeSnapshot ? rawCodeSnapshot : undefined
|
|
62
217
|
};
|
|
218
|
+
await writeSummaryAsset(summaryPath, summary);
|
|
219
|
+
}
|
|
220
|
+
/**
 * Deletes every `<hash>.json` file in `summaryDir` whose hash is not
 * referenced by any entry of `manifest.files`.
 *
 * Entries that do not end in `.json` are left alone. A missing summary
 * directory is not an error. A file that disappears between the directory
 * listing and its removal is also not an error, because the goal (the file
 * being gone) is already met; any other filesystem error is rethrown.
 *
 * @param {string} summaryDir - Directory holding the per-hash summary assets.
 * @param {{files: Object<string, string>}} manifest - Manifest whose values are the active hashes.
 * @returns {Promise<void>}
 * @throws Any `readdir`/`unlink` error whose `code` is not `"ENOENT"`.
 */
async function pruneOrphanedSummaries(summaryDir, manifest) {
    const SUFFIX = ".json";
    const activeHashes = new Set(Object.values(manifest.files));
    let entries;
    try {
        entries = await promises_1.default.readdir(summaryDir);
    }
    catch (error) {
        if (error.code === "ENOENT") {
            // Nothing has been written yet, so there is nothing to prune.
            return;
        }
        throw error;
    }
    for (const entry of entries) {
        if (!entry.endsWith(SUFFIX)) {
            continue;
        }
        const hash = entry.slice(0, -SUFFIX.length);
        if (activeHashes.has(hash)) {
            continue;
        }
        try {
            await promises_1.default.unlink(node_path_1.default.resolve(summaryDir, entry));
        }
        catch (error) {
            // Already removed by someone else: treat as success.
            if (error.code !== "ENOENT") {
                throw error;
            }
        }
    }
}
|
|
64
244
|
async function runSummarize(options, config) {
|
|
65
245
|
if (options.mode !== "all" && options.mode !== "delta") {
|
|
@@ -68,46 +248,93 @@ async function runSummarize(options, config) {
|
|
|
68
248
|
const commandCwd = process.cwd();
|
|
69
249
|
const repoPath = node_path_1.default.resolve(commandCwd, options.path);
|
|
70
250
|
const manifestPath = (0, paths_1.resolveDiffdocArtifactPath)(options.out, config.baseDir);
|
|
71
|
-
const
|
|
251
|
+
const summaryDir = getSummaryDir(manifestPath);
|
|
252
|
+
const manifest = await readManifest(manifestPath);
|
|
253
|
+
const refs = countHashRefs(manifest.files);
|
|
254
|
+
const includePatterns = compileGlobs((options.includeGlobs && options.includeGlobs.length > 0)
|
|
255
|
+
? options.includeGlobs.map(normalizeGlobPattern)
|
|
256
|
+
: config.summarize.includeGlobs.map(normalizeGlobPattern));
|
|
257
|
+
const excludePatterns = compileGlobs((options.excludeGlobs && options.excludeGlobs.length > 0)
|
|
258
|
+
? options.excludeGlobs.map(normalizeGlobPattern)
|
|
259
|
+
: config.summarize.excludeGlobs.map(normalizeGlobPattern));
|
|
260
|
+
const ignoreFile = options.ignoreFile || config.summarize.ignoreFile;
|
|
261
|
+
const ignorePatterns = compileGlobs(await readIgnorePatterns(repoPath, ignoreFile));
|
|
262
|
+
const failures = [];
|
|
72
263
|
if (options.mode === "all") {
|
|
73
|
-
const files = await walkCodeFiles(repoPath);
|
|
74
264
|
manifest.files = {};
|
|
265
|
+
refs.clear();
|
|
266
|
+
await writeManifest(manifestPath, manifest);
|
|
267
|
+
const files = await walkCodeFiles(repoPath, includePatterns, excludePatterns, ignorePatterns);
|
|
75
268
|
for (const filePath of files) {
|
|
76
|
-
|
|
77
|
-
|
|
269
|
+
try {
|
|
270
|
+
const absolutePath = node_path_1.default.join(repoPath, filePath);
|
|
271
|
+
const rawCodeSnapshot = await promises_1.default.readFile(absolutePath, "utf8");
|
|
272
|
+
const hash = (0, hashing_1.hashFileContent)(rawCodeSnapshot);
|
|
273
|
+
const summaryPath = getSummaryPath(summaryDir, hash);
|
|
274
|
+
if (!await fileExists(summaryPath)) {
|
|
275
|
+
const summaryText = await (0, llm_1.generateFunctionalSummary)(filePath, rawCodeSnapshot, config.chat);
|
|
276
|
+
await ensureSummaryAsset(summaryDir, hash, summaryText, rawCodeSnapshot, options.includeCodeSnapshot);
|
|
277
|
+
}
|
|
278
|
+
manifest.files[filePath] = hash;
|
|
279
|
+
refs.set(hash, (refs.get(hash) || 0) + 1);
|
|
280
|
+
await writeManifest(manifestPath, manifest);
|
|
281
|
+
console.log(`Summarized ${filePath}`);
|
|
282
|
+
}
|
|
283
|
+
catch (error) {
|
|
284
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
285
|
+
failures.push({ filePath, message });
|
|
286
|
+
console.error(`Failed ${filePath}: ${message}`);
|
|
287
|
+
}
|
|
78
288
|
}
|
|
79
289
|
}
|
|
80
290
|
else {
|
|
81
291
|
const deltas = await (0, git_1.getGitDeltas)(repoPath, manifest.lastSyncedCommit);
|
|
82
292
|
for (const deletedPath of deltas.deleted) {
|
|
83
|
-
|
|
293
|
+
await removeManifestPath(deletedPath, manifest, manifestPath, summaryDir, refs);
|
|
84
294
|
console.log(`Pruned ${deletedPath}`);
|
|
85
295
|
}
|
|
86
296
|
for (const filePath of deltas.modifiedOrAdded) {
|
|
87
|
-
const absolutePath = node_path_1.default.join(repoPath, filePath);
|
|
88
297
|
try {
|
|
298
|
+
if (!shouldIncludeFile(filePath, includePatterns, excludePatterns, ignorePatterns)) {
|
|
299
|
+
await removeManifestPath(filePath, manifest, manifestPath, summaryDir, refs);
|
|
300
|
+
continue;
|
|
301
|
+
}
|
|
302
|
+
const previousHash = manifest.files[filePath];
|
|
303
|
+
const absolutePath = node_path_1.default.join(repoPath, filePath);
|
|
89
304
|
const rawCodeSnapshot = await promises_1.default.readFile(absolutePath, "utf8");
|
|
90
305
|
const hash = (0, hashing_1.hashFileContent)(rawCodeSnapshot);
|
|
91
|
-
if (
|
|
306
|
+
if (previousHash === hash) {
|
|
92
307
|
continue;
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
rawCodeSnapshot
|
|
97
|
-
|
|
308
|
+
}
|
|
309
|
+
const summaryPath = getSummaryPath(summaryDir, hash);
|
|
310
|
+
if (!await fileExists(summaryPath)) {
|
|
311
|
+
const summaryText = await (0, llm_1.generateFunctionalSummary)(filePath, rawCodeSnapshot, config.chat);
|
|
312
|
+
await ensureSummaryAsset(summaryDir, hash, summaryText, rawCodeSnapshot, options.includeCodeSnapshot);
|
|
313
|
+
}
|
|
314
|
+
await setManifestPathHash(filePath, hash, manifest, manifestPath, summaryDir, refs);
|
|
98
315
|
console.log(`Updated ${filePath}`);
|
|
99
316
|
}
|
|
100
317
|
catch (error) {
|
|
101
318
|
const nodeError = error;
|
|
102
319
|
if (nodeError.code === "ENOENT") {
|
|
103
|
-
|
|
320
|
+
await removeManifestPath(filePath, manifest, manifestPath, summaryDir, refs);
|
|
104
321
|
continue;
|
|
105
322
|
}
|
|
106
|
-
|
|
323
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
324
|
+
failures.push({ filePath, message });
|
|
325
|
+
console.error(`Failed ${filePath}: ${message}`);
|
|
107
326
|
}
|
|
108
327
|
}
|
|
109
328
|
}
|
|
110
329
|
manifest.lastSyncedCommit = await (0, git_1.getCurrentCommit)(repoPath);
|
|
111
330
|
await writeManifest(manifestPath, manifest);
|
|
331
|
+
await pruneOrphanedSummaries(summaryDir, manifest);
|
|
112
332
|
console.log(`Wrote manifest to ${manifestPath}`);
|
|
333
|
+
if (failures.length > 0) {
|
|
334
|
+
console.error(`\n${failures.length} file(s) failed during summarization:`);
|
|
335
|
+
for (const failure of failures) {
|
|
336
|
+
console.error(`- ${failure.filePath}: ${failure.message}`);
|
|
337
|
+
}
|
|
338
|
+
throw new Error("Summarization completed with failures.");
|
|
339
|
+
}
|
|
113
340
|
}
|
package/dist/config.js
CHANGED
|
@@ -9,6 +9,22 @@ const node_path_1 = __importDefault(require("node:path"));
|
|
|
9
9
|
/**
 * Resolves a single string setting.
 *
 * Precedence: the explicit `value` if truthy, then the environment variable
 * named `envName` if truthy, then `fallback`.
 *
 * @param {string|undefined} value - Explicitly supplied value.
 * @param {string} envName - Name of the environment variable to consult.
 * @param {string} [fallback=""] - Value used when neither source is set.
 * @returns {string} The first truthy candidate, or `fallback`.
 */
function readOption(value, envName, fallback = "") {
    if (value) {
        return value;
    }
    const fromEnv = process.env[envName];
    return fromEnv ? fromEnv : fallback;
}
|
|
12
|
+
/**
 * Splits a comma-separated string into its trimmed, non-empty items.
 *
 * @param {string} value - Text such as `"a, b,,c"`.
 * @returns {string[]} Items in their original order, with blanks removed.
 */
function parseCsv(value) {
    const items = [];
    for (const piece of value.split(",")) {
        const trimmed = piece.trim();
        if (trimmed) {
            items.push(trimmed);
        }
    }
    return items;
}
|
|
15
|
+
/**
 * Resolves a list-valued setting.
 *
 * Precedence: the explicit `value` (an array of strings, each of which may
 * itself be comma-separated, or a single comma-separated string), then the
 * comma-separated environment variable `envName`, then `fallback`.
 *
 * A source only wins when it yields at least one non-blank item, so an empty
 * array behaves like a blank string and falls through to the next source.
 * Non-string array members are ignored instead of raising a TypeError.
 *
 * @param {string[]|string|undefined} value - Explicitly supplied value.
 * @param {string} envName - Name of the environment variable to consult.
 * @param {string[]} [fallback=[]] - Returned when no source yields items.
 * @returns {string[]} The resolved list.
 */
function readListOption(value, envName, fallback = []) {
    let explicit = [];
    if (Array.isArray(value)) {
        explicit = value
            .filter((item) => typeof item === "string")
            .flatMap((item) => parseCsv(item));
    }
    else if (typeof value === "string") {
        explicit = parseCsv(value);
    }
    if (explicit.length > 0) {
        return explicit;
    }
    const envValue = process.env[envName];
    const fromEnv = typeof envValue === "string" ? parseCsv(envValue) : [];
    return fromEnv.length > 0 ? fromEnv : fallback;
}
|
|
12
28
|
function loadRcFile(configPath) {
|
|
13
29
|
const resolvedPath = node_path_1.default.resolve(process.cwd(), configPath || ".diffdocrc");
|
|
14
30
|
if (!node_fs_1.default.existsSync(resolvedPath)) {
|
|
@@ -41,6 +57,9 @@ function buildRuntimeConfig(options, needs = { chat: true, embeddings: true }) {
|
|
|
41
57
|
const mergedOptions = mergeConfigOptions(options);
|
|
42
58
|
const provider = readProvider(mergedOptions.aiProvider);
|
|
43
59
|
const apiKey = readOption(mergedOptions.openaiApiKey, "OPENAI_API_KEY", provider === "local" ? "local-key" : "");
|
|
60
|
+
const includeGlobs = readListOption(mergedOptions.includeGlobs, "DIFFDOC_INCLUDE_GLOBS");
|
|
61
|
+
const excludeGlobs = readListOption(mergedOptions.excludeGlobs, "DIFFDOC_EXCLUDE_GLOBS");
|
|
62
|
+
const ignoreFile = readOption(mergedOptions.ignoreFile, "DIFFDOC_IGNORE_FILE", ".diffdocignore");
|
|
44
63
|
const chatBaseURL = provider === "cloud"
|
|
45
64
|
? readOption(mergedOptions.cloudLlmEndpoint, "CLOUD_LLM_ENDPOINT", "https://api.openai.com/v1")
|
|
46
65
|
: readOption(mergedOptions.localLlmEndpoint, "LOCAL_LLM_ENDPOINT");
|
|
@@ -80,6 +99,11 @@ function buildRuntimeConfig(options, needs = { chat: true, embeddings: true }) {
|
|
|
80
99
|
apiKey,
|
|
81
100
|
baseURL: embedBaseURL,
|
|
82
101
|
model: embedModel
|
|
102
|
+
},
|
|
103
|
+
summarize: {
|
|
104
|
+
includeGlobs,
|
|
105
|
+
excludeGlobs,
|
|
106
|
+
ignoreFile
|
|
83
107
|
}
|
|
84
108
|
};
|
|
85
109
|
}
|
package/dist/index.js
CHANGED
|
@@ -8,6 +8,10 @@ const query_1 = require("./commands/query");
|
|
|
8
8
|
const summarize_1 = require("./commands/summarize");
|
|
9
9
|
const llm_1 = require("./utils/llm");
|
|
10
10
|
const program = new commander_1.Command();
|
|
11
|
+
/**
 * Accumulator for repeatable command-line options.
 *
 * Returns a new array instead of pushing onto `previous`, because the first
 * `previous` is the default array object registered with the option; mutating
 * it would leak values into that shared default.
 *
 * @param {string} value - The value of the current occurrence of the option.
 * @param {string[]} [previous=[]] - Values collected so far.
 * @returns {string[]} A new array: `previous` followed by `value`.
 */
function collectOption(value, previous = []) {
    return [...previous, value];
}
|
|
11
15
|
function addBaseOptions(command) {
|
|
12
16
|
return command
|
|
13
17
|
.option("--config <path>", "path to .diffdocrc JSON config file")
|
|
@@ -43,10 +47,22 @@ addChatOptions(addBaseOptions(program
|
|
|
43
47
|
.option("--path <path>", "repository or code path to scan", ".")
|
|
44
48
|
.option("--out <path>", "manifest output path under --base-dir", "manifest.json")
|
|
45
49
|
.option("--mode <mode>", "summarization mode: all or delta", "all")
|
|
50
|
+
.option("--include-code-snapshot", "store raw code in summary assets", false)
|
|
51
|
+
.option("--include-glob <pattern>", "include glob pattern (repeatable)", collectOption, [])
|
|
52
|
+
.option("--exclude-glob <pattern>", "exclude glob pattern (repeatable)", collectOption, [])
|
|
53
|
+
.option("--ignore-file <path>", "path to ignore pattern file relative to --path")
|
|
46
54
|
.action(async (options) => {
|
|
47
55
|
try {
|
|
48
56
|
const config = (0, config_1.buildRuntimeConfig)(options, { chat: true });
|
|
49
|
-
await (0, summarize_1.runSummarize)({
|
|
57
|
+
await (0, summarize_1.runSummarize)({
|
|
58
|
+
path: options.path,
|
|
59
|
+
out: options.out,
|
|
60
|
+
mode: options.mode,
|
|
61
|
+
includeCodeSnapshot: options.includeCodeSnapshot,
|
|
62
|
+
includeGlobs: options.includeGlob,
|
|
63
|
+
excludeGlobs: options.excludeGlob,
|
|
64
|
+
ignoreFile: options.ignoreFile
|
|
65
|
+
}, config);
|
|
50
66
|
}
|
|
51
67
|
catch (error) {
|
|
52
68
|
console.error(error instanceof Error ? error.message : error);
|
|
@@ -104,10 +120,11 @@ addCloudEndpointAndKeyOptions(addEmbeddingOptions(addBaseOptions(program
|
|
|
104
120
|
.command("embed"))))
|
|
105
121
|
.description("Embed manifest summaries into a local Vectra index")
|
|
106
122
|
.option("--manifest <path>", "manifest input path under --base-dir", "manifest.json")
|
|
123
|
+
.option("--rebuild", "rebuild local index from scratch", false)
|
|
107
124
|
.action(async (options) => {
|
|
108
125
|
try {
|
|
109
126
|
const config = (0, config_1.buildRuntimeConfig)(options, { embeddings: true });
|
|
110
|
-
await (0, embed_1.runEmbed)({ manifest: options.manifest }, config);
|
|
127
|
+
await (0, embed_1.runEmbed)({ manifest: options.manifest, rebuild: options.rebuild }, config);
|
|
111
128
|
}
|
|
112
129
|
catch (error) {
|
|
113
130
|
console.error(error instanceof Error ? error.message : error);
|
package/dist/mcp.js
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
const mcp_js_1 = require("@modelcontextprotocol/sdk/server/mcp.js");
|
|
5
|
+
const stdio_js_1 = require("@modelcontextprotocol/sdk/server/stdio.js");
|
|
6
|
+
const zod_1 = require("zod");
|
|
7
|
+
const config_1 = require("./config");
|
|
8
|
+
const retrieval_1 = require("./services/retrieval");
|
|
9
|
+
// Version string reported by the MCP server when it is constructed in main().
// NOTE(review): package.json is bumped to 0.3.0 in this release while this
// value stays "0.1.1" — confirm whether the two are meant to track each other.
const MCP_SERVER_VERSION = "0.1.1";
|
|
10
|
+
// Supported `--flag` names mapped to the option property each one populates.
// A Map (rather than a plain object) keeps inherited keys such as
// "constructor" from being mistaken for valid flags.
const MCP_CLI_OPTION_PROPERTIES = new Map([
    ["config", "config"],
    ["base-dir", "baseDir"],
    ["ai-provider", "aiProvider"],
    ["local-llm-endpoint", "localLlmEndpoint"],
    ["local-chat-model", "localChatModel"],
    ["local-embed-endpoint", "localEmbedEndpoint"],
    ["local-embed-model", "localEmbedModel"],
    ["cloud-llm-endpoint", "cloudLlmEndpoint"],
    ["cloud-chat-model", "cloudChatModel"],
    ["cloud-embed-model", "cloudEmbedModel"],
    ["openai-api-key", "openaiApiKey"]
]);
/**
 * Parses the MCP server's command-line arguments into an options object.
 *
 * Every recognised flag takes exactly one value, written either as
 * `--flag value` or as `--flag=value`. Arguments that do not start with `--`
 * and are not consumed as a flag's value are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {Object<string, string>} Options keyed by camelCase property name.
 * @throws {Error} `Missing value for --<flag>.` when a flag has no value, or
 *   `Unknown MCP option: --<flag>.` when the flag is not supported.
 */
function readCliOptions(argv) {
    const options = {};
    for (let i = 0; i < argv.length; i += 1) {
        const arg = argv[i];
        if (!arg.startsWith("--")) {
            continue;
        }
        const separatorIndex = arg.indexOf("=");
        let key;
        let value;
        if (separatorIndex === -1) {
            key = arg.slice(2);
            value = argv[i + 1];
            if (!value || value.startsWith("--")) {
                throw new Error(`Missing value for --${key}.`);
            }
            // The next argument was consumed as this flag's value.
            i += 1;
        }
        else {
            key = arg.slice(2, separatorIndex);
            value = arg.slice(separatorIndex + 1);
            if (!value) {
                throw new Error(`Missing value for --${key}.`);
            }
        }
        const property = MCP_CLI_OPTION_PROPERTIES.get(key);
        if (!property) {
            throw new Error(`Unknown MCP option: --${key}.`);
        }
        options[property] = value;
    }
    return options;
}
|
|
62
|
+
/**
 * Builds the runtime configuration for one tool invocation by delegating to
 * `buildRuntimeConfig` from the shared config module.
 *
 * @param {Object} options - Options parsed from the server's command line.
 * @param {{chat?: boolean, embeddings?: boolean}} needs - Which model settings the caller needs.
 * @returns {Object} Whatever `buildRuntimeConfig` returns.
 */
function buildConfig(options, needs) {
    const { buildRuntimeConfig } = config_1;
    return buildRuntimeConfig(options, needs);
}
|
|
65
|
+
/**
 * Wraps `data` in a tool result holding one text part: the value serialised
 * as JSON with two-space indentation.
 *
 * @param {*} data - Value to serialise.
 * @returns {{content: Array<{type: string, text: string}>}} Result object with a single text part.
 */
function jsonText(data) {
    const text = JSON.stringify(data, null, 2);
    const textPart = { type: "text", text };
    return { content: [textPart] };
}
|
|
75
|
+
/**
 * Entry point of the DiffDoc MCP server.
 *
 * Parses the command line once, registers three tools on a new server
 * (`diffdoc_search`, `diffdoc_answer`, `diffdoc_index_stats`) and then
 * connects the server to a stdio transport. Each tool handler builds its own
 * runtime configuration from the parsed options at call time.
 *
 * @returns {Promise<void>} Settles once `server.connect` has settled.
 */
async function main() {
    const runtimeOptions = readCliOptions(process.argv.slice(2));
    const server = new mcp_js_1.McpServer({
        name: "diffdoc",
        version: MCP_SERVER_VERSION
    });
    const { z } = zod_1;
    // Search: returns the raw matches; code snapshots only when requested.
    const handleSearch = async ({ query, top = 5, includeCode = false }) => {
        const config = buildConfig(runtimeOptions, { embeddings: true });
        const queryText = String(query);
        const limit = (0, retrieval_1.parseTopK)(top);
        const matches = await (0, retrieval_1.searchIndex)(queryText, limit, config);
        const results = matches.map((match) => ({
            filePath: match.filePath,
            score: match.score,
            hash: match.hash,
            summaryText: match.summaryText,
            rawCodeSnapshot: includeCode ? match.rawCodeSnapshot : undefined
        }));
        return jsonText({ results });
    };
    // Answer: retrieval plus a chat-model answer; full results are opt-in.
    const handleAnswer = async ({ question, top = 5, includeResults = false }) => {
        const config = buildConfig(runtimeOptions, { chat: true, embeddings: true });
        const questionText = String(question);
        const limit = (0, retrieval_1.parseTopK)(top);
        const reply = await (0, retrieval_1.answerFromIndex)(questionText, limit, config);
        return jsonText({
            answer: reply.answer,
            sources: reply.sources,
            results: includeResults ? reply.results : undefined
        });
    };
    // Stats: requests neither chat nor embedding settings.
    const handleIndexStats = async () => {
        const config = buildConfig(runtimeOptions, { embeddings: false, chat: false });
        const stats = await (0, retrieval_1.getIndexStats)(config);
        return jsonText(stats);
    };
    server.registerTool("diffdoc_search", {
        title: "Search DiffDoc Index",
        description: "Search the local DiffDoc Vectra index and return raw matching files, summaries, and optional code snapshots.",
        inputSchema: {
            query: z.string().min(1).describe("Natural-language search query."),
            top: z.number().int().positive().optional().describe("Number of matches to return."),
            includeCode: z.boolean().optional().describe("Include raw code snapshots in the returned results.")
        }
    }, handleSearch);
    server.registerTool("diffdoc_answer", {
        title: "Answer From DiffDoc Index",
        description: "Answer a question using retrieved DiffDoc index context and the configured chat model.",
        inputSchema: {
            question: z.string().min(1).describe("Question to answer using indexed DiffDoc context."),
            top: z.number().int().positive().optional().describe("Number of matches to retrieve before answering."),
            includeResults: z.boolean().optional().describe("Include full retrieved results in addition to answer and sources.")
        }
    }, handleAnswer);
    server.registerTool("diffdoc_index_stats", {
        title: "DiffDoc Index Stats",
        description: "Return the local DiffDoc Vectra index path, existence status, and indexed item count.",
        inputSchema: {}
    }, handleIndexStats);
    await server.connect(new stdio_js_1.StdioServerTransport());
}
|
|
130
|
+
// Start the server. Any startup failure is reported on stderr (the message
// for Error instances, the raw value otherwise) and the process exits with
// status 1.
main().catch((error) => {
    const report = error instanceof Error ? error.message : error;
    console.error(report);
    process.exit(1);
});
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.parseTopK = parseTopK;
|
|
4
|
+
exports.trimForDisplay = trimForDisplay;
|
|
5
|
+
exports.searchIndex = searchIndex;
|
|
6
|
+
exports.answerFromIndex = answerFromIndex;
|
|
7
|
+
exports.getIndexStats = getIndexStats;
|
|
8
|
+
const vectra_1 = require("vectra");
|
|
9
|
+
const embed_1 = require("../commands/embed");
|
|
10
|
+
const llm_1 = require("../utils/llm");
|
|
11
|
+
// Instruction text prepended to the user's query before it is embedded in
// searchIndex(); the query passed to the index's own text argument is left
// unprefixed.
const CODE_QUERY_PREFIX = "Represent this query for searching relevant code: ";
|
|
12
|
+
/**
 * Validates a "top K" result count.
 *
 * Numbers are checked as-is. Any other value is converted to a string,
 * trimmed, and must consist solely of decimal digits (with an optional
 * leading `+`). Partially numeric text such as `"5abc"` or `"1.5"` is
 * rejected rather than being silently truncated to its leading digits.
 *
 * @param {number|string} value - Requested number of results.
 * @returns {number} The validated positive integer.
 * @throws {Error} When `value` is not a positive integer.
 */
function parseTopK(value) {
    let topK;
    if (typeof value === "number") {
        topK = value;
    }
    else {
        const text = String(value).trim();
        topK = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    }
    if (!Number.isInteger(topK) || topK < 1) {
        throw new Error("Invalid top value. Expected a positive integer.");
    }
    return topK;
}
|
|
19
|
+
/**
 * Shortens `text` for display.
 *
 * Text of at most `maxLength` characters is returned unchanged. Longer text
 * is cut to its first `maxLength` characters, stripped of trailing
 * whitespace, and suffixed with `...` (so the result may be up to three
 * characters longer than `maxLength`).
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum number of characters kept from `text`.
 * @returns {string} The original or shortened text.
 */
function trimForDisplay(text, maxLength) {
    const fits = text.length <= maxLength;
    if (fits) {
        return text;
    }
    const head = text.slice(0, maxLength).trimEnd();
    return `${head}...`;
}
|
|
25
|
+
/**
 * Flattens one index query hit into a plain search result.
 *
 * @param {{score: number, item: {metadata: Object}}} result - Hit carrying a score and an item with metadata.
 * @returns {{filePath: *, score: number, hash: *, summaryText: *, rawCodeSnapshot: *}}
 *   The hit's score plus the `filePath`, `hash`, `summaryText` and
 *   `rawCodeSnapshot` fields copied from the item's metadata.
 */
function mapSearchResult(result) {
    const { score, item } = result;
    const { filePath, hash, summaryText, rawCodeSnapshot } = item.metadata;
    return { filePath, score, hash, summaryText, rawCodeSnapshot };
}
|
|
35
|
+
/**
 * Builds the chat prompt used to answer `question` from retrieved results.
 *
 * Each result becomes a section listing its 1-based position, file path,
 * score, summary and code snapshot (or `(not stored)` when the snapshot is
 * falsy). Sections are separated by a `---` rule and appended after the fixed
 * instructions and the user's question.
 *
 * @param {string} question - The user's question.
 * @param {Array<{filePath: string, score: number, summaryText: string, rawCodeSnapshot: (string|undefined)}>} results - Retrieved search results.
 * @returns {string} The complete prompt text.
 */
function buildAnswerPrompt(question, results) {
    const sections = [];
    results.forEach((result, position) => {
        const snapshot = result.rawCodeSnapshot || "(not stored)";
        sections.push(`Result ${position + 1}\nFile: ${result.filePath}\nScore: ${result.score}\nSummary:\n${result.summaryText}\nCode Snapshot:\n${snapshot}`);
    });
    const context = sections.join("\n\n---\n\n");
    const instructions = "Answer the user's question using only the retrieved DiffDoc results below. If the results do not contain enough information, say what is missing. Prefer a direct answer first, then cite the relevant file paths. Keep the explanation appropriate to the question: summarize when asked for a summary, explain implementation details when asked how something works, and avoid unsupported claims.";
    return `${instructions}\n\nUser question:\n${question}\n\nRetrieved results:\n${context}`;
}
|
|
47
|
+
/**
 * Opens the local index for `config`, insisting that it already exists.
 *
 * @param {Object} config - Runtime configuration passed to `getVectraIndexPath`.
 * @returns {Promise<Object>} The `LocalIndex` instance for the resolved path.
 * @throws {Error} When the index reports that it has not been created.
 */
async function getExistingIndex(config) {
    const indexPath = (0, embed_1.getVectraIndexPath)(config);
    const index = new vectra_1.LocalIndex(indexPath);
    const created = await index.isIndexCreated();
    if (created) {
        return index;
    }
    throw new Error(`No Vectra index found at ${indexPath}. Run "diffdoc embed" first.`);
}
|
|
55
|
+
/**
 * Runs a query against the existing local index.
 *
 * The query is embedded with `CODE_QUERY_PREFIX` prepended, then the index is
 * queried with that vector, the unprefixed query text and `topK`; each hit is
 * converted with `mapSearchResult`.
 *
 * @param {string} query - Natural-language search text.
 * @param {number} topK - Maximum number of hits to request.
 * @param {Object} config - Runtime configuration; `config.embeddings` is passed to `generateEmbeddings`.
 * @returns {Promise<Array<Object>>} Mapped search results.
 * @throws {Error} Propagates the error from `getExistingIndex` when no index exists.
 */
async function searchIndex(query, topK, config) {
    const index = await getExistingIndex(config);
    const embeddingInput = [`${CODE_QUERY_PREFIX}${query}`];
    const [queryVector] = await (0, llm_1.generateEmbeddings)(embeddingInput, config.embeddings);
    const hits = await index.queryItems(queryVector, query, topK);
    return hits.map(mapSearchResult);
}
|
|
61
|
+
/**
 * Answers `question` from the indexed summaries.
 *
 * Searches the index first. With no hits, a fixed "no matches" answer is
 * returned and the chat model is not called. Otherwise the hits are turned
 * into a prompt, sent to the chat model, and returned alongside the answer.
 *
 * @param {string} question - The user's question.
 * @param {number} topK - Number of search results to retrieve.
 * @param {Object} config - Runtime configuration; `config.chat` is passed to `promptLlm`.
 * @returns {Promise<{answer: string, sources: Array<{filePath: string, score: number}>, results: Array<Object>}>}
 */
async function answerFromIndex(question, topK, config) {
    const results = await searchIndex(question, topK, config);
    if (results.length === 0) {
        return {
            answer: "No matching embedded summaries found.",
            sources: [],
            results: []
        };
    }
    const prompt = buildAnswerPrompt(question, results);
    const answer = await (0, llm_1.promptLlm)(prompt, config.chat);
    const sources = results.map(({ filePath, score }) => ({ filePath, score }));
    return { answer, sources, results };
}
|
|
77
|
+
/**
 * Reports where the local index lives, whether it exists, and how many items
 * it holds.
 *
 * @param {Object} config - Runtime configuration passed to `getVectraIndexPath`.
 * @returns {Promise<{indexPath: string, exists: boolean, items: number}>}
 *   `items` is 0 when the index has not been created; otherwise it is the
 *   `items` value from the index's own stats.
 */
async function getIndexStats(config) {
    const indexPath = (0, embed_1.getVectraIndexPath)(config);
    const index = new vectra_1.LocalIndex(indexPath);
    if (await index.isIndexCreated()) {
        const { items } = await index.getIndexStats();
        return { indexPath, exists: true, items };
    }
    return { indexPath, exists: false, items: 0 };
}
|
package/dist/utils/git.js
CHANGED
|
@@ -7,16 +7,12 @@ exports.getGitDeltas = getGitDeltas;
|
|
|
7
7
|
exports.getCurrentCommit = getCurrentCommit;
|
|
8
8
|
const node_path_1 = __importDefault(require("node:path"));
|
|
9
9
|
const simple_git_1 = __importDefault(require("simple-git"));
|
|
10
|
-
const TARGET_EXTENSIONS = new Set([".ts", ".js", ".cs", ".py"]);
|
|
11
10
|
/**
 * Rewrites a path to use forward slashes by replacing every occurrence of the
 * platform path separator with `/`.
 *
 * @param {string} filePath - Path using the platform separator.
 * @returns {string} The same path with `/` separators.
 */
function normalizePath(filePath) {
    const segments = filePath.split(node_path_1.default.sep);
    return segments.join("/");
}
|
|
14
|
-
function isTargetCodeFile(filePath) {
|
|
15
|
-
return TARGET_EXTENSIONS.has(node_path_1.default.extname(filePath));
|
|
16
|
-
}
|
|
17
13
|
/**
 * Adds the trimmed, normalized form of `filePath` to `target`, skipping paths
 * that are empty after normalization.
 *
 * @param {Set<string>} target - Collection receiving the path (anything with an `add` method).
 * @param {string} filePath - Raw path, possibly with surrounding whitespace.
 * @returns {void}
 */
function addUnique(target, filePath) {
    const normalized = normalizePath(filePath.trim());
    if (!normalized) {
        return;
    }
    target.add(normalized);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "diffdoc",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Translate repository code shifts into plain-English business context",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Christopher Sullivan",
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
"type": "commonjs",
|
|
16
16
|
"main": "dist/index.js",
|
|
17
17
|
"bin": {
|
|
18
|
-
"diffdoc": "./dist/index.js"
|
|
18
|
+
"diffdoc": "./dist/index.js",
|
|
19
|
+
"diffdoc-mcp": "./dist/mcp.js"
|
|
19
20
|
},
|
|
20
21
|
"files": [
|
|
21
22
|
"dist",
|
|
@@ -33,10 +34,12 @@
|
|
|
33
34
|
"prepare": "npm run build"
|
|
34
35
|
},
|
|
35
36
|
"dependencies": {
|
|
37
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
36
38
|
"commander": "^12.0.0",
|
|
37
39
|
"openai": "^4.28.0",
|
|
38
40
|
"simple-git": "^3.24.0",
|
|
39
|
-
"vectra": "^0.14.0"
|
|
41
|
+
"vectra": "^0.14.0",
|
|
42
|
+
"zod": "^3.25.76"
|
|
40
43
|
},
|
|
41
44
|
"devDependencies": {
|
|
42
45
|
"@types/node": "^20.19.41",
|