@usewhisper/mcp-server 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +182 -154
- package/dist/autosubscribe-6EDKPBE2.js +4068 -4068
- package/dist/autosubscribe-GHO6YR5A.js +4068 -4068
- package/dist/autosubscribe-ISDETQIB.js +435 -435
- package/dist/chunk-3WGYBAYR.js +8387 -8387
- package/dist/chunk-52VJYCZ7.js +455 -455
- package/dist/chunk-5KBZQHDL.js +189 -189
- package/dist/chunk-5KIJNY6Z.js +370 -370
- package/dist/chunk-7SN3CKDK.js +1076 -1076
- package/dist/chunk-B3VWOHUA.js +271 -271
- package/dist/chunk-C57DHKTL.js +459 -459
- package/dist/chunk-EI5CE3EY.js +616 -616
- package/dist/chunk-FTWUJBAH.js +386 -386
- package/dist/chunk-H3HSKH2P.js +4841 -4841
- package/dist/chunk-JO3ORBZD.js +616 -616
- package/dist/chunk-L6DXSM2U.js +456 -456
- package/dist/chunk-LMEYV4JD.js +368 -368
- package/dist/chunk-MEFLJ4PV.js +8385 -8385
- package/dist/chunk-OBLI4FE4.js +275 -275
- package/dist/chunk-PPGYJJED.js +271 -271
- package/dist/chunk-QGM4M3NI.js +37 -37
- package/dist/chunk-T7KMSTWP.js +399 -399
- package/dist/chunk-TWEIYHI6.js +399 -399
- package/dist/chunk-UYWE7HSU.js +368 -368
- package/dist/chunk-X2DL2GWT.js +32 -32
- package/dist/chunk-X7HNNNJJ.js +1079 -1079
- package/dist/consolidation-2GCKI4RE.js +220 -220
- package/dist/consolidation-4JOPW6BG.js +220 -220
- package/dist/consolidation-FOVQTWNQ.js +222 -222
- package/dist/consolidation-IFQ52E44.js +209 -209
- package/dist/context-sharing-4ITCNKG4.js +307 -307
- package/dist/context-sharing-6CCFIAKL.js +275 -275
- package/dist/context-sharing-GYKLXHZA.js +307 -307
- package/dist/context-sharing-PH64JTXS.js +308 -308
- package/dist/context-sharing-Y6LTZZOF.js +307 -307
- package/dist/cost-optimization-6OIKRSBV.js +195 -195
- package/dist/cost-optimization-7DVSTL6R.js +307 -307
- package/dist/cost-optimization-BH5NAX33.js +286 -286
- package/dist/cost-optimization-F3L5BS5F.js +303 -303
- package/dist/ingest-2LPTWUUM.js +16 -16
- package/dist/ingest-7T5FAZNC.js +15 -15
- package/dist/ingest-EBNIE7XB.js +15 -15
- package/dist/ingest-FSHT5BCS.js +15 -15
- package/dist/ingest-QE2BTV72.js +14 -14
- package/dist/oracle-3RLQF3DP.js +259 -259
- package/dist/oracle-FKRTQUUG.js +282 -282
- package/dist/oracle-J47QCSEW.js +263 -263
- package/dist/oracle-MDP5MZRC.js +256 -256
- package/dist/search-BLVHWLWC.js +14 -14
- package/dist/search-CZ5NYL5B.js +12 -12
- package/dist/search-EG6TYWWW.js +13 -13
- package/dist/search-I22QQA7T.js +13 -13
- package/dist/search-T7H5G6DW.js +13 -13
- package/dist/server.d.ts +2 -2
- package/dist/server.js +1973 -169
- package/dist/server.js.map +1 -1
- package/package.json +51 -51
package/dist/chunk-FTWUJBAH.js
CHANGED
|
@@ -1,387 +1,387 @@
|
|
|
1
|
-
import {
|
|
2
|
-
embed,
|
|
3
|
-
embedSingle,
|
|
4
|
-
prisma
|
|
5
|
-
} from "./chunk-X2DL2GWT.js";
|
|
6
|
-
|
|
7
|
-
// src/engine/chunker.ts
|
|
8
|
-
var CODE_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
9
|
-
".ts",
|
|
10
|
-
".tsx",
|
|
11
|
-
".js",
|
|
12
|
-
".jsx",
|
|
13
|
-
".py",
|
|
14
|
-
".java",
|
|
15
|
-
".go",
|
|
16
|
-
".rb",
|
|
17
|
-
".php",
|
|
18
|
-
".cs",
|
|
19
|
-
".rs",
|
|
20
|
-
".swift",
|
|
21
|
-
".kt",
|
|
22
|
-
".scala",
|
|
23
|
-
".c",
|
|
24
|
-
".cpp",
|
|
25
|
-
".h",
|
|
26
|
-
".hpp",
|
|
27
|
-
".sol",
|
|
28
|
-
".vy"
|
|
29
|
-
]);
|
|
30
|
-
var CONFIG_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
31
|
-
".json",
|
|
32
|
-
".yaml",
|
|
33
|
-
".yml",
|
|
34
|
-
".toml",
|
|
35
|
-
".ini",
|
|
36
|
-
".env",
|
|
37
|
-
".xml"
|
|
38
|
-
]);
|
|
39
|
-
function detectChunkType(filePath, content) {
|
|
40
|
-
if (!filePath) return "text";
|
|
41
|
-
const ext = "." + filePath.split(".").pop()?.toLowerCase();
|
|
42
|
-
if (CODE_EXTENSIONS.has(ext)) return "code";
|
|
43
|
-
if (CONFIG_EXTENSIONS.has(ext)) return "config";
|
|
44
|
-
if (filePath.includes("schema") || filePath.includes("migration")) return "schema";
|
|
45
|
-
if (filePath.endsWith(".md") || filePath.endsWith(".mdx") || filePath.endsWith(".rst")) return "documentation";
|
|
46
|
-
if (filePath.includes("openapi") || filePath.includes("swagger")) return "api_spec";
|
|
47
|
-
return "text";
|
|
48
|
-
}
|
|
49
|
-
function chunkText(content, opts = {}) {
|
|
50
|
-
const { chunkSize = 1e3, chunkOverlap = 200, filePath, metadata = {} } = opts;
|
|
51
|
-
const chunkType = detectChunkType(filePath, content);
|
|
52
|
-
if (chunkType === "code") {
|
|
53
|
-
return chunkCode(content, { chunkSize, filePath, metadata });
|
|
54
|
-
}
|
|
55
|
-
return chunkBySize(content, { chunkSize, chunkOverlap, chunkType, metadata });
|
|
56
|
-
}
|
|
57
|
-
function chunkCode(content, opts) {
|
|
58
|
-
const { chunkSize, filePath, metadata = {} } = opts;
|
|
59
|
-
const lines = content.split("\n");
|
|
60
|
-
const chunks = [];
|
|
61
|
-
const boundaries = [
|
|
62
|
-
/^(export\s+)?(async\s+)?function\s+/,
|
|
63
|
-
/^(export\s+)?(default\s+)?class\s+/,
|
|
64
|
-
/^(export\s+)?const\s+\w+\s*=\s*(async\s+)?\(/,
|
|
65
|
-
/^(export\s+)?const\s+\w+\s*=\s*\{/,
|
|
66
|
-
/^(export\s+)?interface\s+/,
|
|
67
|
-
/^(export\s+)?type\s+/,
|
|
68
|
-
/^(export\s+)?enum\s+/,
|
|
69
|
-
/^def\s+/,
|
|
70
|
-
// Python
|
|
71
|
-
/^class\s+/,
|
|
72
|
-
// Python/Java
|
|
73
|
-
/^func\s+/,
|
|
74
|
-
// Go
|
|
75
|
-
/^pub\s+(fn|struct|enum|impl)/
|
|
76
|
-
// Rust
|
|
77
|
-
];
|
|
78
|
-
let currentChunk = [];
|
|
79
|
-
let currentStart = 0;
|
|
80
|
-
for (let i = 0; i < lines.length; i++) {
|
|
81
|
-
const trimmed = lines[i].trimStart();
|
|
82
|
-
const isBoundary = boundaries.some((b) => b.test(trimmed));
|
|
83
|
-
if (isBoundary && currentChunk.length > 0) {
|
|
84
|
-
const chunkContent = currentChunk.join("\n").trim();
|
|
85
|
-
if (chunkContent.length > 0) {
|
|
86
|
-
chunks.push({
|
|
87
|
-
content: chunkContent,
|
|
88
|
-
chunkType: "code",
|
|
89
|
-
chunkIndex: chunks.length,
|
|
90
|
-
metadata: {
|
|
91
|
-
...metadata,
|
|
92
|
-
filePath,
|
|
93
|
-
startLine: currentStart + 1,
|
|
94
|
-
endLine: i
|
|
95
|
-
}
|
|
96
|
-
});
|
|
97
|
-
}
|
|
98
|
-
currentChunk = [lines[i]];
|
|
99
|
-
currentStart = i;
|
|
100
|
-
} else {
|
|
101
|
-
currentChunk.push(lines[i]);
|
|
102
|
-
}
|
|
103
|
-
if (currentChunk.join("\n").length > chunkSize * 1.5) {
|
|
104
|
-
const chunkContent = currentChunk.join("\n").trim();
|
|
105
|
-
if (chunkContent.length > 0) {
|
|
106
|
-
chunks.push({
|
|
107
|
-
content: chunkContent,
|
|
108
|
-
chunkType: "code",
|
|
109
|
-
chunkIndex: chunks.length,
|
|
110
|
-
metadata: {
|
|
111
|
-
...metadata,
|
|
112
|
-
filePath,
|
|
113
|
-
startLine: currentStart + 1,
|
|
114
|
-
endLine: i + 1
|
|
115
|
-
}
|
|
116
|
-
});
|
|
117
|
-
}
|
|
118
|
-
currentChunk = [];
|
|
119
|
-
currentStart = i + 1;
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
if (currentChunk.length > 0) {
|
|
123
|
-
const chunkContent = currentChunk.join("\n").trim();
|
|
124
|
-
if (chunkContent.length > 0) {
|
|
125
|
-
chunks.push({
|
|
126
|
-
content: chunkContent,
|
|
127
|
-
chunkType: "code",
|
|
128
|
-
chunkIndex: chunks.length,
|
|
129
|
-
metadata: {
|
|
130
|
-
...metadata,
|
|
131
|
-
filePath,
|
|
132
|
-
startLine: currentStart + 1,
|
|
133
|
-
endLine: lines.length
|
|
134
|
-
}
|
|
135
|
-
});
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
return chunks;
|
|
139
|
-
}
|
|
140
|
-
function chunkBySize(content, opts) {
|
|
141
|
-
const { chunkSize, chunkOverlap, chunkType, metadata = {} } = opts;
|
|
142
|
-
const chunks = [];
|
|
143
|
-
const paragraphs = content.split(/\n\n+/);
|
|
144
|
-
let current = "";
|
|
145
|
-
for (const para of paragraphs) {
|
|
146
|
-
if ((current + "\n\n" + para).length > chunkSize && current.length > 0) {
|
|
147
|
-
chunks.push({
|
|
148
|
-
content: current.trim(),
|
|
149
|
-
chunkType,
|
|
150
|
-
chunkIndex: chunks.length,
|
|
151
|
-
metadata
|
|
152
|
-
});
|
|
153
|
-
const words = current.split(/\s+/);
|
|
154
|
-
const overlapWords = words.slice(-Math.floor(chunkOverlap / 5));
|
|
155
|
-
current = overlapWords.join(" ") + "\n\n" + para;
|
|
156
|
-
} else {
|
|
157
|
-
current = current ? current + "\n\n" + para : para;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
if (current.trim().length > 0) {
|
|
161
|
-
chunks.push({
|
|
162
|
-
content: current.trim(),
|
|
163
|
-
chunkType,
|
|
164
|
-
chunkIndex: chunks.length,
|
|
165
|
-
metadata
|
|
166
|
-
});
|
|
167
|
-
}
|
|
168
|
-
return chunks;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
// src/engine/extractor.ts
|
|
172
|
-
import OpenAI from "openai";
|
|
173
|
-
var openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
|
174
|
-
async function extractEntities(projectId, content, chunkType, metadata = {}, chunkId) {
|
|
175
|
-
if (content.length < 100) return { entities: 0, relations: 0 };
|
|
176
|
-
const isCode = ["code", "function", "class"].includes(chunkType);
|
|
177
|
-
const prompt = isCode ? `Analyze this code and extract entities and relationships.
|
|
178
|
-
|
|
179
|
-
Entities: functions, classes, interfaces, types, modules, variables, constants, API endpoints, services.
|
|
180
|
-
Relations: imports, exports, calls, implements, extends, depends_on, references, part_of.
|
|
181
|
-
|
|
182
|
-
Code:
|
|
183
|
-
\`\`\`
|
|
184
|
-
${content.slice(0, 3e3)}
|
|
185
|
-
\`\`\`
|
|
186
|
-
|
|
187
|
-
Respond with JSON only:
|
|
188
|
-
{
|
|
189
|
-
"entities": [{"name": "...", "type": "function|class|interface|module|constant|api_endpoint|service", "description": "one line"}],
|
|
190
|
-
"relations": [{"from": "name", "fromType": "type", "to": "name", "toType": "type", "relation": "imports|calls|extends|implements|depends_on|references|part_of"}]
|
|
191
|
-
}` : `Analyze this text and extract key entities (concepts, people, tools, services, APIs, technologies) and their relationships.
|
|
192
|
-
|
|
193
|
-
Text:
|
|
194
|
-
${content.slice(0, 3e3)}
|
|
195
|
-
|
|
196
|
-
Respond with JSON only:
|
|
197
|
-
{
|
|
198
|
-
"entities": [{"name": "...", "type": "concept|tool|service|api|technology|person|organization", "description": "one line"}],
|
|
199
|
-
"relations": [{"from": "name", "fromType": "type", "to": "name", "toType": "type", "relation": "references|depends_on|related_to|part_of|supersedes"}]
|
|
200
|
-
}`;
|
|
201
|
-
try {
|
|
202
|
-
const res = await openai.chat.completions.create({
|
|
203
|
-
model: "gpt-4.1-nano",
|
|
204
|
-
messages: [{ role: "user", content: prompt }],
|
|
205
|
-
temperature: 0,
|
|
206
|
-
max_tokens: 1e3,
|
|
207
|
-
response_format: { type: "json_object" }
|
|
208
|
-
});
|
|
209
|
-
const text = res.choices[0]?.message?.content?.trim() || "{}";
|
|
210
|
-
const parsed = JSON.parse(text);
|
|
211
|
-
const extractedEntities = parsed.entities || [];
|
|
212
|
-
const extractedRelations = parsed.relations || [];
|
|
213
|
-
let entityCount = 0;
|
|
214
|
-
let relationCount = 0;
|
|
215
|
-
const entityMap = /* @__PURE__ */ new Map();
|
|
216
|
-
for (const ent of extractedEntities.slice(0, 20)) {
|
|
217
|
-
if (!ent.name || !ent.type) continue;
|
|
218
|
-
const embedding = await embedSingle(`${ent.type}: ${ent.name} - ${ent.description || ""}`);
|
|
219
|
-
const entity = await prisma.entity.upsert({
|
|
220
|
-
where: {
|
|
221
|
-
projectId_name_entityType: {
|
|
222
|
-
projectId,
|
|
223
|
-
name: ent.name,
|
|
224
|
-
entityType: ent.type
|
|
225
|
-
}
|
|
226
|
-
},
|
|
227
|
-
update: {
|
|
228
|
-
description: ent.description,
|
|
229
|
-
sourceChunkId: chunkId,
|
|
230
|
-
embedding,
|
|
231
|
-
updatedAt: /* @__PURE__ */ new Date()
|
|
232
|
-
},
|
|
233
|
-
create: {
|
|
234
|
-
projectId,
|
|
235
|
-
name: ent.name,
|
|
236
|
-
entityType: ent.type,
|
|
237
|
-
description: ent.description,
|
|
238
|
-
metadata: { ...metadata, autoExtracted: true },
|
|
239
|
-
sourceChunkId: chunkId,
|
|
240
|
-
embedding
|
|
241
|
-
}
|
|
242
|
-
});
|
|
243
|
-
entityMap.set(`${ent.name}:${ent.type}`, entity.id);
|
|
244
|
-
entityCount++;
|
|
245
|
-
}
|
|
246
|
-
for (const rel of extractedRelations.slice(0, 30)) {
|
|
247
|
-
if (!rel.from || !rel.to || !rel.relation) continue;
|
|
248
|
-
const fromId = entityMap.get(`${rel.from}:${rel.fromType}`);
|
|
249
|
-
const toId = entityMap.get(`${rel.to}:${rel.toType}`);
|
|
250
|
-
if (!fromId || !toId) continue;
|
|
251
|
-
const validRelations = [
|
|
252
|
-
"imports",
|
|
253
|
-
"exports",
|
|
254
|
-
"calls",
|
|
255
|
-
"implements",
|
|
256
|
-
"extends",
|
|
257
|
-
"references",
|
|
258
|
-
"depends_on",
|
|
259
|
-
"related_to",
|
|
260
|
-
"part_of",
|
|
261
|
-
"contradicts",
|
|
262
|
-
"supersedes"
|
|
263
|
-
];
|
|
264
|
-
if (!validRelations.includes(rel.relation)) continue;
|
|
265
|
-
await prisma.entityRelation.upsert({
|
|
266
|
-
where: {
|
|
267
|
-
fromEntityId_toEntityId_relationType: {
|
|
268
|
-
fromEntityId: fromId,
|
|
269
|
-
toEntityId: toId,
|
|
270
|
-
relationType: rel.relation
|
|
271
|
-
}
|
|
272
|
-
},
|
|
273
|
-
update: {
|
|
274
|
-
metadata: { autoExtracted: true }
|
|
275
|
-
},
|
|
276
|
-
create: {
|
|
277
|
-
projectId,
|
|
278
|
-
fromEntityId: fromId,
|
|
279
|
-
toEntityId: toId,
|
|
280
|
-
relationType: rel.relation,
|
|
281
|
-
metadata: { autoExtracted: true }
|
|
282
|
-
}
|
|
283
|
-
});
|
|
284
|
-
relationCount++;
|
|
285
|
-
}
|
|
286
|
-
return { entities: entityCount, relations: relationCount };
|
|
287
|
-
} catch {
|
|
288
|
-
return { entities: 0, relations: 0 };
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
// src/engine/ingest.ts
|
|
293
|
-
import { createHash } from "crypto";
|
|
294
|
-
import PQueue from "p-queue";
|
|
295
|
-
var queue = new PQueue({ concurrency: 3 });
|
|
296
|
-
var ENABLE_AUTO_EXTRACTION = process.env.DISABLE_AUTO_EXTRACTION !== "true";
|
|
297
|
-
async function ingestDocument(input) {
|
|
298
|
-
const { sourceId, projectId, externalId, title, content, metadata = {}, filePath } = input;
|
|
299
|
-
const contentHash = createHash("sha256").update(content).digest("hex");
|
|
300
|
-
const doc = await prisma.document.upsert({
|
|
301
|
-
where: {
|
|
302
|
-
sourceId_externalId: {
|
|
303
|
-
sourceId,
|
|
304
|
-
externalId
|
|
305
|
-
}
|
|
306
|
-
},
|
|
307
|
-
update: {
|
|
308
|
-
title,
|
|
309
|
-
content,
|
|
310
|
-
metadata,
|
|
311
|
-
contentHash,
|
|
312
|
-
updatedAt: /* @__PURE__ */ new Date()
|
|
313
|
-
},
|
|
314
|
-
create: {
|
|
315
|
-
sourceId,
|
|
316
|
-
projectId,
|
|
317
|
-
externalId,
|
|
318
|
-
title,
|
|
319
|
-
content,
|
|
320
|
-
metadata,
|
|
321
|
-
contentHash
|
|
322
|
-
}
|
|
323
|
-
});
|
|
324
|
-
await prisma.chunk.deleteMany({
|
|
325
|
-
where: { documentId: doc.id }
|
|
326
|
-
});
|
|
327
|
-
const textChunks = chunkText(content, {
|
|
328
|
-
filePath: filePath || externalId,
|
|
329
|
-
metadata: { ...metadata, title }
|
|
330
|
-
});
|
|
331
|
-
if (textChunks.length === 0) return doc;
|
|
332
|
-
const batchSize = 50;
|
|
333
|
-
const insertedChunkIds = [];
|
|
334
|
-
for (let i = 0; i < textChunks.length; i += batchSize) {
|
|
335
|
-
const batch = textChunks.slice(i, i + batchSize);
|
|
336
|
-
const embeddings = await embed(batch.map((c) => c.content));
|
|
337
|
-
const inserted = await prisma.$transaction(
|
|
338
|
-
batch.map(
|
|
339
|
-
(chunk, j) => prisma.chunk.create({
|
|
340
|
-
data: {
|
|
341
|
-
documentId: doc.id,
|
|
342
|
-
projectId,
|
|
343
|
-
content: chunk.content,
|
|
344
|
-
chunkType: chunk.chunkType,
|
|
345
|
-
chunkIndex: chunk.chunkIndex,
|
|
346
|
-
metadata: chunk.metadata,
|
|
347
|
-
embedding: embeddings[j],
|
|
348
|
-
tokenCount: Math.ceil(chunk.content.length / 4)
|
|
349
|
-
},
|
|
350
|
-
select: { id: true }
|
|
351
|
-
})
|
|
352
|
-
)
|
|
353
|
-
);
|
|
354
|
-
insertedChunkIds.push(...inserted.map((c) => c.id));
|
|
355
|
-
}
|
|
356
|
-
if (ENABLE_AUTO_EXTRACTION && !input.skipEntityExtraction) {
|
|
357
|
-
const chunksToExtract = textChunks.filter((c) => c.content.length > 200).slice(0, 5);
|
|
358
|
-
for (let i = 0; i < chunksToExtract.length; i++) {
|
|
359
|
-
const chunk = chunksToExtract[i];
|
|
360
|
-
const chunkId = insertedChunkIds[textChunks.indexOf(chunk)];
|
|
361
|
-
extractEntities(projectId, chunk.content, chunk.chunkType, metadata, chunkId).catch(() => {
|
|
362
|
-
});
|
|
363
|
-
}
|
|
364
|
-
}
|
|
365
|
-
const docCount = await prisma.document.count({
|
|
366
|
-
where: { sourceId }
|
|
367
|
-
});
|
|
368
|
-
const chunkCount = await prisma.chunk.count({
|
|
369
|
-
where: { documentId: doc.id }
|
|
370
|
-
});
|
|
371
|
-
await prisma.source.update({
|
|
372
|
-
where: { id: sourceId },
|
|
373
|
-
data: {
|
|
374
|
-
documentCount: docCount,
|
|
375
|
-
chunkCount,
|
|
376
|
-
lastSyncAt: /* @__PURE__ */ new Date(),
|
|
377
|
-
status: "READY",
|
|
378
|
-
updatedAt: /* @__PURE__ */ new Date()
|
|
379
|
-
}
|
|
380
|
-
});
|
|
381
|
-
return doc;
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
export {
|
|
385
|
-
ingestDocument
|
|
386
|
-
};
|
|
1
|
+
import {
|
|
2
|
+
embed,
|
|
3
|
+
embedSingle,
|
|
4
|
+
prisma
|
|
5
|
+
} from "./chunk-X2DL2GWT.js";
|
|
6
|
+
|
|
7
|
+
// src/engine/chunker.ts
|
|
8
|
+
// Lower-case file extensions (with leading dot) that are chunked as source code.
var CODE_EXTENSIONS = /* @__PURE__ */ new Set([
  ".ts",
  ".tsx",
  ".js",
  ".jsx",
  ".py",
  ".java",
  ".go",
  ".rb",
  ".php",
  ".cs",
  ".rs",
  ".swift",
  ".kt",
  ".scala",
  ".c",
  ".cpp",
  ".h",
  ".hpp",
  ".sol",
  ".vy"
]);
// Lower-case file extensions labelled "config". A bare ".env" dotfile also
// matches because its whole name is the extension.
var CONFIG_EXTENSIONS = /* @__PURE__ */ new Set([
  ".json",
  ".yaml",
  ".yml",
  ".toml",
  ".ini",
  ".env",
  ".xml"
]);
/**
 * Classify a document by its path.
 *
 * Checks run in a fixed order and the first match wins: code extension,
 * config extension, "schema"/"migration" anywhere in the path, documentation
 * extension, "openapi"/"swagger" anywhere in the path, otherwise "text".
 *
 * @param {string | undefined} filePath - Path or identifier of the document.
 * @param {string} [content] - Accepted for interface compatibility; not inspected.
 * @returns {"code" | "config" | "schema" | "documentation" | "api_spec" | "text"}
 */
function detectChunkType(filePath, content) {
  if (!filePath) return "text";
  // Only the final path segment can carry an extension. A segment without a
  // dot has none, so a file literally named "go" or "py" is not source code.
  const lastSeparator = Math.max(filePath.lastIndexOf("/"), filePath.lastIndexOf("\\"));
  const baseName = filePath.slice(lastSeparator + 1);
  const dotIndex = baseName.lastIndexOf(".");
  const ext = dotIndex === -1 ? "" : baseName.slice(dotIndex).toLowerCase();
  if (CODE_EXTENSIONS.has(ext)) return "code";
  if (CONFIG_EXTENSIONS.has(ext)) return "config";
  if (filePath.includes("schema") || filePath.includes("migration")) return "schema";
  // Compare the lower-cased extension so "README.MD" is treated like "readme.md",
  // matching the case-insensitive code/config checks above.
  if (ext === ".md" || ext === ".mdx" || ext === ".rst") return "documentation";
  if (filePath.includes("openapi") || filePath.includes("swagger")) return "api_spec";
  return "text";
}
|
|
49
|
+
/**
 * Chunker entry point: classify the input by its path, then hand it to the
 * matching strategy.
 *
 * @param {string} content - Full text of the document.
 * @param {object} [opts]
 * @param {number} [opts.chunkSize=1000] - Size budget forwarded to the strategy.
 * @param {number} [opts.chunkOverlap=200] - Overlap budget (size-based strategy only).
 * @param {string} [opts.filePath] - Path used for classification.
 * @param {object} [opts.metadata={}] - Metadata forwarded to the strategy.
 * @returns {Array<object>} Whatever the selected strategy returns.
 */
function chunkText(content, opts = {}) {
  const chunkSize = opts.chunkSize === undefined ? 1e3 : opts.chunkSize;
  const chunkOverlap = opts.chunkOverlap === undefined ? 200 : opts.chunkOverlap;
  const filePath = opts.filePath;
  const metadata = opts.metadata === undefined ? {} : opts.metadata;
  const detectedType = detectChunkType(filePath, content);
  // Everything that is not source code takes the size-based path and keeps
  // its detected type as the chunk label.
  if (detectedType !== "code") {
    return chunkBySize(content, {
      chunkSize,
      chunkOverlap,
      chunkType: detectedType,
      metadata
    });
  }
  return chunkCode(content, { chunkSize, filePath, metadata });
}
|
|
57
|
+
/**
 * Split source code into chunks along declaration boundaries.
 *
 * A new chunk starts at every line whose left-trimmed text matches one of the
 * boundary patterns. A chunk is also cut as soon as its buffered text exceeds
 * 1.5 x chunkSize characters. Chunks whose trimmed text is empty are dropped.
 *
 * @param {string} content - Source text.
 * @param {object} opts
 * @param {number} opts.chunkSize - Soft size budget in characters.
 * @param {string} [opts.filePath] - Copied into every chunk's metadata.
 * @param {object} [opts.metadata={}] - Extra metadata merged into every chunk.
 * @returns {Array<{content: string, chunkType: "code", chunkIndex: number, metadata: object}>}
 *   Chunks in source order; metadata.startLine/endLine are 1-based and inclusive.
 */
function chunkCode(content, opts) {
  const { chunkSize, filePath, metadata = {} } = opts;
  const lines = content.split("\n");
  const chunks = [];
  const boundaries = [
    // `default` is accepted here as it already was for classes, so
    // `export default function` starts a new chunk too.
    /^(export\s+)?(default\s+)?(async\s+)?function\s+/,
    /^(export\s+)?(default\s+)?class\s+/,
    /^(export\s+)?const\s+\w+\s*=\s*(async\s+)?\(/,
    /^(export\s+)?const\s+\w+\s*=\s*\{/,
    /^(export\s+)?interface\s+/,
    /^(export\s+)?type\s+/,
    /^(export\s+)?enum\s+/,
    /^def\s+/,
    // Python
    /^class\s+/,
    // Python/Java
    /^func\s+/,
    // Go
    /^pub\s+(fn|struct|enum|impl)/
    // Rust
  ];
  let currentChunk = [];
  let currentStart = 0;
  // Emit the buffered lines as one chunk ending at the given 1-based line.
  // Whitespace-only buffers produce no chunk.
  const flush = (endLine) => {
    const chunkContent = currentChunk.join("\n").trim();
    if (chunkContent.length === 0) return;
    chunks.push({
      content: chunkContent,
      chunkType: "code",
      chunkIndex: chunks.length,
      metadata: {
        ...metadata,
        filePath,
        startLine: currentStart + 1,
        endLine
      }
    });
  };
  for (let i = 0; i < lines.length; i++) {
    const trimmed = lines[i].trimStart();
    const isBoundary = boundaries.some((b) => b.test(trimmed));
    if (isBoundary && currentChunk.length > 0) {
      // The previous chunk ends on the line before this one (0-based i - 1,
      // i.e. 1-based i); the boundary line opens the next chunk.
      flush(i);
      currentChunk = [lines[i]];
      currentStart = i;
    } else {
      currentChunk.push(lines[i]);
    }
    if (currentChunk.join("\n").length > chunkSize * 1.5) {
      // Oversized buffer: cut here, including the current line.
      flush(i + 1);
      currentChunk = [];
      currentStart = i + 1;
    }
  }
  flush(lines.length);
  return chunks;
}
|
|
140
|
+
/**
 * Split prose-like content into chunks of roughly chunkSize characters.
 *
 * The text is split on blank lines and paragraphs are accumulated until adding
 * the next one would exceed chunkSize. The buffer is then emitted and the next
 * chunk is seeded with the trailing words of the previous one as overlap.
 * A single paragraph longer than chunkSize is never split further.
 *
 * @param {string} content - Text to chunk.
 * @param {object} opts
 * @param {number} opts.chunkSize - Character budget per chunk.
 * @param {number} opts.chunkOverlap - Overlap budget; divided by 5 to get the
 *   number of trailing words carried over (presumably ~5 characters per word).
 *   Values below 5, including 0, mean no overlap.
 * @param {string} opts.chunkType - Label copied onto every chunk.
 * @param {object} [opts.metadata={}] - Attached as-is (same object) to every chunk.
 * @returns {Array<{content: string, chunkType: string, chunkIndex: number, metadata: object}>}
 */
function chunkBySize(content, opts) {
  const { chunkSize, chunkOverlap, chunkType, metadata = {} } = opts;
  const chunks = [];
  const paragraphs = content.split(/\n\n+/);
  // slice(-0) is slice(0) and would return every word, making each chunk
  // swallow all previous text. Resolve the word count once and treat anything
  // that rounds down to zero (or is not a positive number) as "no overlap".
  const overlapWordCount = chunkOverlap > 0 ? Math.floor(chunkOverlap / 5) : 0;
  let current = "";
  for (const para of paragraphs) {
    if ((current + "\n\n" + para).length > chunkSize && current.length > 0) {
      const finished = current.trim();
      // Skip whitespace-only buffers, consistent with the tail check below.
      if (finished.length > 0) {
        chunks.push({
          content: finished,
          chunkType,
          chunkIndex: chunks.length,
          metadata
        });
      }
      if (overlapWordCount > 0) {
        const overlapWords = current.split(/\s+/).slice(-overlapWordCount);
        current = overlapWords.join(" ") + "\n\n" + para;
      } else {
        current = para;
      }
    } else {
      current = current ? current + "\n\n" + para : para;
    }
  }
  if (current.trim().length > 0) {
    chunks.push({
      content: current.trim(),
      chunkType,
      chunkIndex: chunks.length,
      metadata
    });
  }
  return chunks;
}
|
|
170
|
+
|
|
171
|
+
// src/engine/extractor.ts
|
|
172
|
+
import OpenAI from "openai";
|
|
173
|
+
var openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
|
174
|
+
/**
 * Ask the chat model to extract entities and relations from one chunk and
 * upsert them through Prisma.
 *
 * Content shorter than 100 characters is skipped. Only the first 3000
 * characters are sent to the model. At most 20 entities and 30 relations from
 * the response are processed, and a relation is stored only when both of its
 * endpoints were upserted in this same call and its type is whitelisted.
 *
 * The function is best-effort: any failure (model call, JSON parsing,
 * embedding, database) is logged to stderr and reported as zero counts, even
 * if some rows were already written before the failure.
 *
 * @param {string} projectId - Project the entities belong to.
 * @param {string} content - Chunk text to analyse.
 * @param {string} chunkType - "code", "function" or "class" select the code prompt.
 * @param {object} [metadata={}] - Merged into the metadata of newly created entities.
 * @param {string} [chunkId] - Stored as sourceChunkId on each entity.
 * @returns {Promise<{entities: number, relations: number}>} Number of upserts performed.
 */
async function extractEntities(projectId, content, chunkType, metadata = {}, chunkId) {
  if (content.length < 100) return { entities: 0, relations: 0 };
  const isCode = ["code", "function", "class"].includes(chunkType);
  const prompt = isCode ? `Analyze this code and extract entities and relationships.

Entities: functions, classes, interfaces, types, modules, variables, constants, API endpoints, services.
Relations: imports, exports, calls, implements, extends, depends_on, references, part_of.

Code:
\`\`\`
${content.slice(0, 3e3)}
\`\`\`

Respond with JSON only:
{
"entities": [{"name": "...", "type": "function|class|interface|module|constant|api_endpoint|service", "description": "one line"}],
"relations": [{"from": "name", "fromType": "type", "to": "name", "toType": "type", "relation": "imports|calls|extends|implements|depends_on|references|part_of"}]
}` : `Analyze this text and extract key entities (concepts, people, tools, services, APIs, technologies) and their relationships.

Text:
${content.slice(0, 3e3)}

Respond with JSON only:
{
"entities": [{"name": "...", "type": "concept|tool|service|api|technology|person|organization", "description": "one line"}],
"relations": [{"from": "name", "fromType": "type", "to": "name", "toType": "type", "relation": "references|depends_on|related_to|part_of|supersedes"}]
}`;
  // Relation types accepted for storage; built once rather than per relation.
  const validRelations = /* @__PURE__ */ new Set([
    "imports",
    "exports",
    "calls",
    "implements",
    "extends",
    "references",
    "depends_on",
    "related_to",
    "part_of",
    "contradicts",
    "supersedes"
  ]);
  try {
    const res = await openai.chat.completions.create({
      model: "gpt-4.1-nano",
      messages: [{ role: "user", content: prompt }],
      temperature: 0,
      max_tokens: 1e3,
      response_format: { type: "json_object" }
    });
    const text = res.choices[0]?.message?.content?.trim() || "{}";
    const parsed = JSON.parse(text);
    // The model output is untrusted: a non-array field is treated as empty so
    // it cannot throw after entities have already been written.
    const extractedEntities = Array.isArray(parsed?.entities) ? parsed.entities : [];
    const extractedRelations = Array.isArray(parsed?.relations) ? parsed.relations : [];
    let entityCount = 0;
    let relationCount = 0;
    // "name:type" -> database id, used to resolve relation endpoints below.
    const entityMap = /* @__PURE__ */ new Map();
    for (const ent of extractedEntities.slice(0, 20)) {
      if (!ent?.name || !ent?.type) continue;
      const embedding = await embedSingle(`${ent.type}: ${ent.name} - ${ent.description || ""}`);
      const entity = await prisma.entity.upsert({
        where: {
          projectId_name_entityType: {
            projectId,
            name: ent.name,
            entityType: ent.type
          }
        },
        update: {
          description: ent.description,
          sourceChunkId: chunkId,
          embedding,
          updatedAt: /* @__PURE__ */ new Date()
        },
        create: {
          projectId,
          name: ent.name,
          entityType: ent.type,
          description: ent.description,
          metadata: { ...metadata, autoExtracted: true },
          sourceChunkId: chunkId,
          embedding
        }
      });
      entityMap.set(`${ent.name}:${ent.type}`, entity.id);
      entityCount++;
    }
    for (const rel of extractedRelations.slice(0, 30)) {
      if (!rel?.from || !rel?.to || !rel?.relation) continue;
      const fromId = entityMap.get(`${rel.from}:${rel.fromType}`);
      const toId = entityMap.get(`${rel.to}:${rel.toType}`);
      // Relations pointing at entities not upserted in this call are dropped.
      if (!fromId || !toId) continue;
      if (!validRelations.has(rel.relation)) continue;
      await prisma.entityRelation.upsert({
        where: {
          fromEntityId_toEntityId_relationType: {
            fromEntityId: fromId,
            toEntityId: toId,
            relationType: rel.relation
          }
        },
        update: {
          metadata: { autoExtracted: true }
        },
        create: {
          projectId,
          fromEntityId: fromId,
          toEntityId: toId,
          relationType: rel.relation,
          metadata: { autoExtracted: true }
        }
      });
      relationCount++;
    }
    return { entities: entityCount, relations: relationCount };
  } catch (err) {
    // Extraction stays best-effort, but the failure is surfaced on stderr
    // instead of disappearing silently.
    console.error("extractEntities failed:", err);
    return { entities: 0, relations: 0 };
  }
}
|
|
291
|
+
|
|
292
|
+
// src/engine/ingest.ts
|
|
293
|
+
import { createHash } from "crypto";
|
|
294
|
+
import PQueue from "p-queue";
|
|
295
|
+
var queue = new PQueue({ concurrency: 3 });
|
|
296
|
+
var ENABLE_AUTO_EXTRACTION = process.env.DISABLE_AUTO_EXTRACTION !== "true";
|
|
297
|
+
/**
 * Store a document, rebuild its chunks with embeddings, kick off entity
 * extraction, and refresh the owning source's counters.
 *
 * Steps, in order:
 *  1. Upsert the document keyed by (sourceId, externalId) with a SHA-256 hash
 *     of its content.
 *  2. Delete every existing chunk of that document.
 *  3. Chunk the content, embed the chunks in batches of 50 and insert each
 *     batch in one Prisma transaction.
 *  4. Unless disabled, start entity extraction for the first five chunks
 *     longer than 200 characters without waiting for it to finish.
 *  5. Update the source's counters, lastSyncAt and status ("READY").
 *
 * @param {object} input
 * @param {string} input.sourceId
 * @param {string} input.projectId
 * @param {string} input.externalId - Also used as the path for chunk-type
 *   detection when filePath is absent.
 * @param {string} input.title
 * @param {string} input.content
 * @param {object} [input.metadata={}]
 * @param {string} [input.filePath]
 * @param {boolean} [input.skipEntityExtraction] - Skip step 4 for this call.
 * @returns {Promise<object>} The upserted document record.
 */
async function ingestDocument(input) {
  const { sourceId, projectId, externalId, title, content, metadata = {}, filePath } = input;
  const contentHash = createHash("sha256").update(content).digest("hex");
  const doc = await prisma.document.upsert({
    where: {
      sourceId_externalId: {
        sourceId,
        externalId
      }
    },
    update: {
      title,
      content,
      metadata,
      contentHash,
      updatedAt: /* @__PURE__ */ new Date()
    },
    create: {
      sourceId,
      projectId,
      externalId,
      title,
      content,
      metadata,
      contentHash
    }
  });
  await prisma.chunk.deleteMany({
    where: { documentId: doc.id }
  });
  const textChunks = chunkText(content, {
    filePath: filePath || externalId,
    metadata: { ...metadata, title }
  });
  // No early return when there are no chunks: the old chunks were just
  // deleted, so the source counters and status below must still be refreshed.
  const batchSize = 50;
  const insertedChunkIds = [];
  for (let i = 0; i < textChunks.length; i += batchSize) {
    const batch = textChunks.slice(i, i + batchSize);
    const embeddings = await embed(batch.map((c) => c.content));
    const inserted = await prisma.$transaction(
      batch.map(
        (chunk, j) => prisma.chunk.create({
          data: {
            documentId: doc.id,
            projectId,
            content: chunk.content,
            chunkType: chunk.chunkType,
            chunkIndex: chunk.chunkIndex,
            metadata: chunk.metadata,
            embedding: embeddings[j],
            // Rough estimate: one token per four characters.
            tokenCount: Math.ceil(chunk.content.length / 4)
          },
          select: { id: true }
        })
      )
    );
    insertedChunkIds.push(...inserted.map((c) => c.id));
  }
  if (ENABLE_AUTO_EXTRACTION && !input.skipEntityExtraction) {
    // insertedChunkIds is filled in textChunks order, so pair each chunk with
    // its id by position before filtering.
    const candidates = textChunks
      .map((chunk, index) => ({ chunk, chunkId: insertedChunkIds[index] }))
      .filter(({ chunk }) => chunk.content.length > 200)
      .slice(0, 5);
    for (const { chunk, chunkId } of candidates) {
      // Deliberately not awaited: extraction runs in the background and its
      // failures must not fail ingestion.
      // NOTE(review): these calls do not go through the module-level `queue`,
      // so their concurrency is not limited here - confirm that is intended.
      extractEntities(projectId, chunk.content, chunk.chunkType, metadata, chunkId).catch(() => {
      });
    }
  }
  // NOTE(review): chunkCount counts only this document's chunks, yet it is
  // written to the source record - confirm whether a per-source total is meant.
  const [docCount, chunkCount] = await Promise.all([
    prisma.document.count({
      where: { sourceId }
    }),
    prisma.chunk.count({
      where: { documentId: doc.id }
    })
  ]);
  await prisma.source.update({
    where: { id: sourceId },
    data: {
      documentCount: docCount,
      chunkCount,
      lastSyncAt: /* @__PURE__ */ new Date(),
      status: "READY",
      updatedAt: /* @__PURE__ */ new Date()
    }
  });
  return doc;
}
|
|
383
|
+
|
|
384
|
+
export {
|
|
385
|
+
ingestDocument
|
|
386
|
+
};
|
|
387
387
|
//# sourceMappingURL=chunk-FTWUJBAH.js.map
|