preflight-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +208 -0
- package/README.zh-CN.md +406 -0
- package/dist/bundle/analysis.js +91 -0
- package/dist/bundle/context7.js +301 -0
- package/dist/bundle/deepwiki.js +206 -0
- package/dist/bundle/facts.js +296 -0
- package/dist/bundle/github.js +55 -0
- package/dist/bundle/guides.js +65 -0
- package/dist/bundle/ingest.js +152 -0
- package/dist/bundle/manifest.js +14 -0
- package/dist/bundle/overview.js +222 -0
- package/dist/bundle/paths.js +29 -0
- package/dist/bundle/service.js +803 -0
- package/dist/bundle/tagging.js +206 -0
- package/dist/config.js +65 -0
- package/dist/context7/client.js +30 -0
- package/dist/context7/tools.js +58 -0
- package/dist/core/scheduler.js +166 -0
- package/dist/errors.js +150 -0
- package/dist/index.js +7 -0
- package/dist/jobs/bundle-auto-update-job.js +71 -0
- package/dist/jobs/health-check-job.js +172 -0
- package/dist/jobs/storage-cleanup-job.js +148 -0
- package/dist/logging/logger.js +311 -0
- package/dist/mcp/uris.js +45 -0
- package/dist/search/sqliteFts.js +481 -0
- package/dist/server/optimized-server.js +255 -0
- package/dist/server.js +778 -0
- package/dist/storage/compression.js +249 -0
- package/dist/storage/storage-adapter.js +316 -0
- package/dist/utils/index.js +100 -0
- package/package.json +44 -0
package/dist/bundle/context7.js
@@ -0,0 +1,301 @@
+import crypto from 'node:crypto';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import { connectContext7 } from '../context7/client.js';
+import { extractContext7IdsFromResult, textFromToolResult } from '../context7/tools.js';
+function normalizeKey(s) {
+    return s.toLowerCase().replace(/[^a-z0-9]+/g, '');
+}
+function parseResolveEntries(text) {
+    const entries = [];
+    for (const chunk of text.split('----------')) {
+        const lines = chunk
+            .split('\n')
+            .map((l) => l.trim())
+            .filter(Boolean);
+        let title;
+        let id;
+        let benchmarkScore;
+        let sourceReputation;
+        let codeSnippets;
+        for (const line of lines) {
+            if (line.startsWith('- Title:')) {
+                title = line.slice('- Title:'.length).trim();
+            }
+            else if (line.toLowerCase().includes('context7-compatible library id:')) {
+                const idx = line.toLowerCase().indexOf('context7-compatible library id:');
+                id = line.slice(idx + 'context7-compatible library id:'.length).trim();
+            }
+            else if (line.toLowerCase().startsWith('- benchmark score:')) {
+                const raw = line.slice('- Benchmark Score:'.length).trim();
+                const n = Number(raw);
+                if (Number.isFinite(n))
+                    benchmarkScore = n;
+            }
+            else if (line.toLowerCase().startsWith('- source reputation:')) {
+                const raw = line.slice('- Source Reputation:'.length).trim();
+                if (raw === 'High' || raw === 'Medium' || raw === 'Low') {
+                    sourceReputation = raw;
+                }
+                else {
+                    sourceReputation = 'Unknown';
+                }
+            }
+            else if (line.toLowerCase().startsWith('- code snippets:')) {
+                const raw = line.slice('- Code Snippets:'.length).trim();
+                const n = Number(raw);
+                if (Number.isFinite(n))
+                    codeSnippets = n;
+            }
+        }
+        if (id && id.startsWith('/') && id.includes('/')) {
+            entries.push({ title, id, benchmarkScore, sourceReputation, codeSnippets });
+        }
+    }
+    return entries;
+}
+function reputationWeight(rep) {
+    if (rep === 'High')
+        return 3;
+    if (rep === 'Medium')
+        return 2;
+    if (rep === 'Low')
+        return 1;
+    return 0;
+}
+function chooseBestEntry(entries, input) {
+    const notes = [];
+    if (entries.length === 0)
+        return { notes };
+    const nk = normalizeKey(input);
+    // Score entries: prefer name match, then reputation/benchmark/snippets.
+    const scored = entries.map((e) => {
+        const titleKey = e.title ? normalizeKey(e.title) : '';
+        const idKey = normalizeKey(e.id);
+        let match = 0;
+        if (titleKey && nk && titleKey === nk)
+            match = 3;
+        else if (titleKey && nk && (titleKey.includes(nk) || nk.includes(titleKey)))
+            match = 2;
+        else if (nk && idKey.includes(nk))
+            match = 1;
+        const score = match * 1_000_000 +
+            reputationWeight(e.sourceReputation) * 10_000 +
+            (e.benchmarkScore ?? 0) * 100 +
+            Math.min(e.codeSnippets ?? 0, 50_000) * 0.01;
+        return { e, match, score };
+    });
+    scored.sort((a, b) => b.score - a.score);
+    const best = scored[0];
+    if (!best)
+        return { notes };
+    const alternatives = scored.filter((s) => s.e.id !== best.e.id && s.match === best.match).slice(0, 3);
+    if (alternatives.length) {
+        notes.push(`resolve-library-id chose ${best.e.id}; alternatives: ${alternatives.map((a) => a.e.id).join(', ')}`);
+    }
+    // If we didn't get even a weak match, still proceed but note.
+    if (best.match === 0) {
+        notes.push(`resolve-library-id had no clear match for ${JSON.stringify(input)}; using ${best.e.id}`);
+    }
+    return { id: best.e.id, notes };
+}
+function nowIso() {
+    return new Date().toISOString();
+}
+function toPosix(p) {
+    return p.replaceAll('\\', '/');
+}
+function sha256Hex(text) {
+    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+function safeIdSegments(context7Id) {
+    const raw = context7Id.trim().replace(/^\/+/, '');
+    const parts = raw.split('/').filter(Boolean);
+    // Prevent any weird traversal segments.
+    return parts.filter((p) => p !== '.' && p !== '..');
+}
+function slug(s) {
+    return s
+        .trim()
+        .toLowerCase()
+        .replace(/[^a-z0-9._-]+/g, '_')
+        .replace(/^_+|_+$/g, '')
+        .slice(0, 64);
+}
+function clipUtf8(text, maxBytes) {
+    const normalized = text.replace(/\r\n/g, '\n');
+    const buf = Buffer.from(normalized, 'utf8');
+    if (buf.length <= maxBytes)
+        return { text: normalized, truncated: false };
+    // Cutting at a byte boundary may split a multi-byte codepoint; Node will replace invalid sequences.
+    const clipped = buf.subarray(0, maxBytes).toString('utf8');
+    return { text: `${clipped}\n\n[TRUNCATED]\n`, truncated: true };
+}
+async function ensureDir(p) {
+    await fs.mkdir(p, { recursive: true });
+}
+async function writeJson(targetPath, obj) {
+    await ensureDir(path.dirname(targetPath));
+    await fs.writeFile(targetPath, JSON.stringify(obj, null, 2) + '\n', 'utf8');
+}
+async function callOrThrow(client, name, args) {
+    const res = await client.callTool({ name, arguments: args });
+    if (res.isError) {
+        throw new Error(textFromToolResult(res) || `${name} failed`);
+    }
+    return {
+        text: textFromToolResult(res),
+        structured: res.structuredContent,
+    };
+}
+async function resolveContext7Id(client, input) {
+    const notes = [];
+    const trimmed = input.trim();
+    if (trimmed.startsWith('/')) {
+        return { id: trimmed, notes };
+    }
+    try {
+        const res = await client.callTool({
+            name: 'resolve-library-id',
+            arguments: { libraryName: trimmed },
+        });
+        if (res.isError) {
+            notes.push(`resolve-library-id error: ${textFromToolResult(res)}`);
+            return { notes };
+        }
+        const text = textFromToolResult(res);
+        // Prefer parsing the structured list output for better selection.
+        const parsed = parseResolveEntries(text);
+        const chosen = chooseBestEntry(parsed, trimmed);
+        if (chosen.id) {
+            notes.push(...chosen.notes);
+            return { id: chosen.id, notes };
+        }
+        // Fallback: regex/structured extraction.
+        const ids = extractContext7IdsFromResult(res);
+        if (ids.length === 0) {
+            notes.push('resolve-library-id returned no Context7 IDs');
+            return { notes };
+        }
+        if (ids.length > 1) {
+            notes.push(`resolve-library-id returned multiple IDs; using first: ${ids[0]}`);
+        }
+        return { id: ids[0], notes };
+    }
+    catch (err) {
+        notes.push(`resolve-library-id threw: ${err instanceof Error ? err.message : String(err)}`);
+        return { notes };
+    }
+}
+export async function ingestContext7Libraries(params) {
+    const libs = (params.libraries ?? []).map((s) => s.trim()).filter(Boolean);
+    if (libs.length === 0)
+        return { files: [], libraries: [] };
+    const topics = (params.topics ?? []).map((s) => s.trim()).filter(Boolean).slice(0, 10);
+    const maxLibraries = 20;
+    await ensureDir(params.bundlePaths.librariesDir);
+    let ctx = null;
+    try {
+        ctx = await connectContext7(params.cfg);
+    }
+    catch (err) {
+        // Best-effort: still write per-library meta.json so the bundle explains what failed.
+        const libraries = [];
+        for (const input of libs.slice(0, maxLibraries)) {
+            const fetchedAt = nowIso();
+            const notes = [`context7 connect failed: ${err instanceof Error ? err.message : String(err)}`];
+            const baseDir = path.join(params.bundlePaths.librariesDir, 'context7', '_unresolved', slug(input) || 'library');
+            await writeJson(path.join(baseDir, 'meta.json'), {
+                kind: 'context7',
+                input,
+                fetchedAt,
+                notes,
+            });
+            libraries.push({ kind: 'context7', input, fetchedAt, notes });
+        }
+        return { files: [], libraries };
+    }
+    const client = ctx.client;
+    const files = [];
+    const libraries = [];
+    try {
+        for (const input of libs.slice(0, maxLibraries)) {
+            const fetchedAt = nowIso();
+            const notes = [];
+            const fileRelPaths = [];
+            const resolved = await resolveContext7Id(client, input);
+            notes.push(...resolved.notes);
+            const id = resolved.id;
+            const baseDir = id
+                ? path.join(params.bundlePaths.librariesDir, 'context7', ...safeIdSegments(id))
+                : path.join(params.bundlePaths.librariesDir, 'context7', '_unresolved', slug(input) || 'library');
+            await ensureDir(baseDir);
+            if (id) {
+                const topicList = topics.length > 0 ? topics : [''];
+                for (const topic of topicList) {
+                    const topicLabel = topic || 'all';
+                    const fileName = topic
+                        ? `topic-${slug(topicLabel)}-page-1.md`
+                        : 'docs-page-1.md';
+                    try {
+                        const args = {
+                            context7CompatibleLibraryID: id,
+                            page: 1,
+                        };
+                        if (topic)
+                            args.topic = topic;
+                        const { text } = await callOrThrow(client, 'get-library-docs', args);
+                        if (!text.trim()) {
+                            notes.push(`get-library-docs returned empty text for topic=${JSON.stringify(topicLabel)}`);
+                            continue;
+                        }
+                        const clipped = clipUtf8(text, params.cfg.maxFileBytes);
+                        if (clipped.truncated) {
+                            notes.push(`docs truncated to maxFileBytes=${params.cfg.maxFileBytes} for topic=${JSON.stringify(topicLabel)}`);
+                        }
+                        const absDocPath = path.join(baseDir, fileName);
+                        await fs.writeFile(absDocPath, clipped.text, 'utf8');
+                        const bundleRelPosix = toPosix(path.relative(params.bundlePaths.rootDir, absDocPath));
+                        fileRelPaths.push(bundleRelPosix);
+                        files.push({
+                            repoId: `context7:${id}`,
+                            kind: 'doc',
+                            repoRelativePath: fileName,
+                            bundleNormRelativePath: bundleRelPosix,
+                            bundleNormAbsPath: absDocPath,
+                            sha256: sha256Hex(clipped.text),
+                            bytes: Buffer.byteLength(clipped.text, 'utf8'),
+                        });
+                    }
+                    catch (err) {
+                        notes.push(`get-library-docs failed for topic=${JSON.stringify(topicLabel)}: ${err instanceof Error ? err.message : String(err)}`);
+                    }
+                }
+            }
+            else {
+                notes.push('Context7 ID unresolved; skipped get-library-docs');
+            }
+            await writeJson(path.join(baseDir, 'meta.json'), {
+                kind: 'context7',
+                input,
+                id,
+                fetchedAt,
+                topics: topics.length > 0 ? topics : undefined,
+                files: fileRelPaths,
+                notes: notes.length > 0 ? notes : undefined,
+            });
+            libraries.push({
+                kind: 'context7',
+                input,
+                id,
+                fetchedAt,
+                files: fileRelPaths.length > 0 ? fileRelPaths : undefined,
+                notes: notes.length > 0 ? notes.slice(0, 50) : undefined,
+            });
+        }
+    }
+    finally {
+        await ctx.close();
+    }
+    return { files, libraries };
+}
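For orientation, here is a minimal sketch of how `ingestContext7Libraries` might be invoked, inferred only from the fields the file above reads (`libraries`, `topics`, `cfg.maxFileBytes`, `bundlePaths.rootDir`, `bundlePaths.librariesDir`). The import path and the `cfg` shape are assumptions; the real config comes from `package/dist/config.js`, which is not shown here.

```js
// Hypothetical invocation -- values are illustrative, not from the package.
import { ingestContext7Libraries } from 'preflight-mcp/dist/bundle/context7.js';

const { files, libraries } = await ingestContext7Libraries({
  libraries: ['react', '/vercel/next.js'], // plain names get resolved; '/org/project' IDs pass through
  topics: ['routing'],                     // optional; capped at 10, each topic saved as its own .md
  cfg: { maxFileBytes: 512 * 1024 },       // assumed shape; cfg is also passed to connectContext7()
  bundlePaths: {
    rootDir: '/tmp/bundle',
    librariesDir: '/tmp/bundle/libraries',
  },
});
// files[]: one entry per written doc (sha256, byte count, bundle-relative path).
// libraries[]: per-input summaries, including notes about resolution or fetch failures.
```

Per the code above, names are resolved through the Context7 `resolve-library-id` tool unless they already look like `/org/project` IDs, and each library's docs land under `libraries/context7/<id>/` next to a `meta.json` recording what was fetched.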
package/dist/bundle/deepwiki.js
@@ -0,0 +1,206 @@
+import crypto from 'node:crypto';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+function nowIso() {
+    return new Date().toISOString();
+}
+function toPosix(p) {
+    return p.replaceAll('\\', '/');
+}
+function sha256Hex(text) {
+    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+async function ensureDir(p) {
+    await fs.mkdir(p, { recursive: true });
+}
+/**
+ * Parse DeepWiki URL to extract owner/repo.
+ * Supports formats like:
+ * - https://deepwiki.com/owner/repo
+ * - https://deepwiki.com/owner/repo/path/to/doc
+ */
+function parseDeepWikiUrl(url) {
+    try {
+        const parsed = new URL(url);
+        if (!parsed.hostname.includes('deepwiki.com'))
+            return null;
+        const parts = parsed.pathname.split('/').filter(Boolean);
+        if (parts.length < 2)
+            return null;
+        return { owner: parts[0], repo: parts[1] };
+    }
+    catch {
+        return null;
+    }
+}
+/**
+ * Fetch a DeepWiki page and extract its content.
+ * Returns the page content as Markdown.
+ */
+async function fetchDeepWikiPage(url, timeoutMs = 30000) {
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+    try {
+        const res = await fetch(url, {
+            headers: {
+                'User-Agent': 'preflight-mcp/0.1.0',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            },
+            signal: controller.signal,
+        });
+        if (!res.ok) {
+            throw new Error(`HTTP ${res.status}: ${res.statusText}`);
+        }
+        const html = await res.text();
+        // Extract main content - DeepWiki typically renders docs in a main content area.
+        // This is a best-effort extraction; real implementation would need more sophisticated parsing.
+        const content = extractMarkdownFromHtml(html);
+        const title = extractTitle(html);
+        return { content, title };
+    }
+    finally {
+        clearTimeout(timeoutId);
+    }
+}
+/**
+ * Simple HTML to Markdown-ish text extraction.
+ * This is a best-effort converter for documentation pages.
+ */
+function extractMarkdownFromHtml(html) {
+    // Remove script and style tags
+    let text = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
+    text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
+    // Convert common HTML elements to Markdown-ish format
+    text = text.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n');
+    text = text.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n');
+    text = text.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n');
+    text = text.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n');
+    text = text.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n##### $1\n');
+    text = text.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n###### $1\n');
+    // Code blocks
+    text = text.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '\n```\n$1\n```\n');
+    text = text.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n');
+    text = text.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
+    // Lists
+    text = text.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
+    text = text.replace(/<ul[^>]*>/gi, '\n');
+    text = text.replace(/<\/ul>/gi, '\n');
+    text = text.replace(/<ol[^>]*>/gi, '\n');
+    text = text.replace(/<\/ol>/gi, '\n');
+    // Paragraphs and line breaks
+    text = text.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n$1\n');
+    text = text.replace(/<br\s*\/?>/gi, '\n');
+    text = text.replace(/<hr\s*\/?>/gi, '\n---\n');
+    // Bold and italic
+    text = text.replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**');
+    text = text.replace(/<b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**');
+    text = text.replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, '*$1*');
+    text = text.replace(/<i[^>]*>([\s\S]*?)<\/i>/gi, '*$1*');
+    // Links
+    text = text.replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
+    // Remove remaining HTML tags
+    text = text.replace(/<[^>]+>/g, '');
+    // Decode HTML entities
+    text = text.replace(/&amp;/g, '&');
+    text = text.replace(/&lt;/g, '<');
+    text = text.replace(/&gt;/g, '>');
+    text = text.replace(/&quot;/g, '"');
+    text = text.replace(/&#39;/g, "'");
+    text = text.replace(/&nbsp;/g, ' ');
+    // Clean up whitespace
+    text = text.replace(/\n{3,}/g, '\n\n');
+    text = text.trim();
+    return text;
+}
+function extractTitle(html) {
+    const match = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
+    if (match?.[1]) {
+        return match[1].replace(/\s*[-|]\s*DeepWiki.*$/i, '').trim();
+    }
+    return undefined;
+}
+function clipUtf8(text, maxBytes) {
+    const normalized = text.replace(/\r\n/g, '\n');
+    const buf = Buffer.from(normalized, 'utf8');
+    if (buf.length <= maxBytes)
+        return { text: normalized, truncated: false };
+    const clipped = buf.subarray(0, maxBytes).toString('utf8');
+    return { text: `${clipped}\n\n[TRUNCATED]\n`, truncated: true };
+}
+export async function ingestDeepWikiRepo(params) {
+    const parsed = parseDeepWikiUrl(params.url);
+    if (!parsed) {
+        return {
+            files: [],
+            summary: {
+                kind: 'deepwiki',
+                url: params.url,
+                repoId: params.url,
+                fetchedAt: nowIso(),
+                notes: ['Invalid DeepWiki URL format'],
+            },
+        };
+    }
+    const repoId = `${parsed.owner}/${parsed.repo}`;
+    const fetchedAt = nowIso();
+    const notes = [];
+    const files = [];
+    const fileRelPaths = [];
+    // Create deepwiki directory structure
+    const deepwikiDir = path.join(params.bundlePaths.rootDir, 'deepwiki', parsed.owner, parsed.repo);
+    const normDir = path.join(deepwikiDir, 'norm');
+    await ensureDir(normDir);
+    try {
+        const { content, title } = await fetchDeepWikiPage(params.url);
+        if (!content.trim()) {
+            notes.push('DeepWiki page returned empty content');
+        }
+        else {
+            const clipped = clipUtf8(content, params.cfg.maxFileBytes);
+            if (clipped.truncated) {
+                notes.push(`Content truncated to maxFileBytes=${params.cfg.maxFileBytes}`);
+            }
+            // Add header with source info
+            const header = `# ${title || repoId} (DeepWiki)\n\nSource: ${params.url}\nFetched: ${fetchedAt}\n\n---\n\n`;
+            const finalContent = header + clipped.text;
+            const fileName = 'index.md';
+            const absDocPath = path.join(normDir, fileName);
+            await fs.writeFile(absDocPath, finalContent, 'utf8');
+            const bundleRelPosix = toPosix(path.relative(params.bundlePaths.rootDir, absDocPath));
+            fileRelPaths.push(bundleRelPosix);
+            files.push({
+                repoId: `deepwiki:${repoId}`,
+                kind: 'doc',
+                repoRelativePath: fileName,
+                bundleNormRelativePath: bundleRelPosix,
+                bundleNormAbsPath: absDocPath,
+                sha256: sha256Hex(finalContent),
+                bytes: Buffer.byteLength(finalContent, 'utf8'),
+            });
+        }
+    }
+    catch (err) {
+        notes.push(`Failed to fetch DeepWiki page: ${err instanceof Error ? err.message : String(err)}`);
+    }
+    // Write meta.json
+    const metaPath = path.join(deepwikiDir, 'meta.json');
+    await fs.writeFile(metaPath, JSON.stringify({
+        kind: 'deepwiki',
+        url: params.url,
+        repoId,
+        fetchedAt,
+        files: fileRelPaths,
+        notes: notes.length > 0 ? notes : undefined,
+    }, null, 2) + '\n', 'utf8');
+    return {
+        files,
+        summary: {
+            kind: 'deepwiki',
+            url: params.url,
+            repoId,
+            fetchedAt,
+            files: fileRelPaths.length > 0 ? fileRelPaths : undefined,
+            notes: notes.length > 0 ? notes : undefined,
+        },
+    };
+}
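Likewise, a minimal sketch for `ingestDeepWikiRepo`, again inferred from the fields the function reads (`url`, `cfg.maxFileBytes`, `bundlePaths.rootDir`); the import path and `cfg` value are illustrative assumptions.

```js
// Hypothetical invocation -- values are illustrative, not from the package.
import { ingestDeepWikiRepo } from 'preflight-mcp/dist/bundle/deepwiki.js';

const { files, summary } = await ingestDeepWikiRepo({
  url: 'https://deepwiki.com/vercel/next.js', // must be a deepwiki.com URL with /owner/repo
  cfg: { maxFileBytes: 512 * 1024 },          // assumed shape; only maxFileBytes is read here
  bundlePaths: { rootDir: '/tmp/bundle' },
});
// On success, files[0] describes deepwiki/vercel/next.js/norm/index.md inside the bundle.
```

Note that the function never throws on bad input: an unparseable URL short-circuits with a note in `summary.notes`, while fetch failures, truncation, and empty content are likewise recorded in notes and `meta.json` is still written under `deepwiki/<owner>/<repo>/`.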