preflight-mcp 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -142
- package/README.zh-CN.md +141 -124
- package/dist/ast/treeSitter.js +588 -0
- package/dist/bundle/analysis.js +47 -0
- package/dist/bundle/context7.js +65 -36
- package/dist/bundle/facts.js +829 -0
- package/dist/bundle/github.js +34 -3
- package/dist/bundle/githubArchive.js +102 -29
- package/dist/bundle/overview.js +226 -48
- package/dist/bundle/service.js +250 -130
- package/dist/config.js +30 -3
- package/dist/context7/client.js +5 -2
- package/dist/evidence/dependencyGraph.js +1136 -0
- package/dist/http/server.js +109 -0
- package/dist/jobs/progressTracker.js +191 -0
- package/dist/search/sqliteFts.js +150 -10
- package/dist/server.js +340 -326
- package/dist/trace/service.js +108 -0
- package/dist/trace/store.js +170 -0
- package/package.json +4 -2
- package/dist/bundle/deepwiki.js +0 -206
package/dist/trace/service.js
ADDED

@@ -0,0 +1,108 @@
+import path from 'node:path';
+import * as z from 'zod';
+import { findBundleStorageDir, getBundlePathsForId, listBundlesMulti } from '../bundle/service.js';
+import { ensureTraceDb, queryTraceEdges, upsertTraceEdges } from './store.js';
+export const TraceUpsertInputSchema = {
+    bundleId: z.string().describe('Bundle ID to attach trace links to.'),
+    edges: z
+        .array(z.object({
+        id: z.string().optional(),
+        source: z.object({ type: z.string(), id: z.string() }),
+        target: z.object({ type: z.string(), id: z.string() }),
+        type: z.string().describe('Edge type, e.g. mentions/tests/implements/relates_to'),
+        confidence: z.number().min(0).max(1).optional(),
+        method: z.enum(['exact', 'heuristic']).optional(),
+        sources: z
+            .array(z.object({
+            file: z.string().optional(),
+            range: z
+                .object({
+                startLine: z.number().int().min(1),
+                startCol: z.number().int().min(1),
+                endLine: z.number().int().min(1),
+                endCol: z.number().int().min(1),
+            })
+                .optional(),
+            externalUrl: z.string().optional(),
+            note: z.string().optional(),
+        }))
+            .optional(),
+    }))
+        .min(1)
+        .describe('Trace edges to upsert.'),
+};
+export const TraceQueryInputSchema = {
+    // When omitted, the query may scan across bundles (best-effort, capped).
+    bundleId: z.string().optional().describe('Optional bundleId. If omitted, scans across bundles (capped).'),
+    source_type: z.string(),
+    source_id: z.string(),
+    target_type: z.string().optional(),
+    target_id: z.string().optional(),
+    edge_type: z.string().optional(),
+    limit: z.number().int().min(1).max(500).default(50),
+    timeBudgetMs: z.number().int().min(500).max(30_000).default(5_000),
+    maxBundles: z.number().int().min(1).max(200).default(50),
+};
+function traceDbPathForBundleRoot(bundleRoot) {
+    return path.join(bundleRoot, 'trace', 'trace.sqlite3');
+}
+async function getTraceDbPathForBundleId(cfg, bundleId) {
+    const storageDir = await findBundleStorageDir(cfg.storageDirs, bundleId);
+    if (!storageDir) {
+        throw new Error(`Bundle not found: ${bundleId}`);
+    }
+    const paths = getBundlePathsForId(storageDir, bundleId);
+    const traceDbPath = traceDbPathForBundleRoot(paths.rootDir);
+    await ensureTraceDb(traceDbPath);
+    return traceDbPath;
+}
+export async function traceUpsert(cfg, rawArgs) {
+    const args = z.object(TraceUpsertInputSchema).parse(rawArgs);
+    const traceDbPath = await getTraceDbPathForBundleId(cfg, args.bundleId);
+    const res = await upsertTraceEdges(traceDbPath, args.edges);
+    return { bundleId: args.bundleId, ...res };
+}
+export async function traceQuery(cfg, rawArgs) {
+    const args = z.object(TraceQueryInputSchema).parse(rawArgs);
+    const source = { type: args.source_type, id: args.source_id };
+    const target = args.target_type && args.target_id ? { type: args.target_type, id: args.target_id } : undefined;
+    // Fast path: single bundle
+    if (args.bundleId) {
+        const dbPath = await getTraceDbPathForBundleId(cfg, args.bundleId);
+        const rows = queryTraceEdges(dbPath, { source, target, edgeType: args.edge_type, limit: args.limit });
+        return { bundleId: args.bundleId, edges: rows };
+    }
+    // Slow path: scan across bundles (best-effort, capped)
+    const startedAt = Date.now();
+    const timeLeft = () => args.timeBudgetMs - (Date.now() - startedAt);
+    let truncated = false;
+    const bundleIds = (await listBundlesMulti(cfg.storageDirs)).slice(0, args.maxBundles);
+    const collected = [];
+    for (const bundleId of bundleIds) {
+        if (timeLeft() <= 0) {
+            truncated = true;
+            break;
+        }
+        try {
+            const dbPath = await getTraceDbPathForBundleId(cfg, bundleId);
+            const rows = queryTraceEdges(dbPath, { source, target, edgeType: args.edge_type, limit: Math.min(50, args.limit) });
+            for (const r of rows) {
+                collected.push({ ...r, bundleId });
+                if (collected.length >= args.limit)
+                    break;
+            }
+        }
+        catch {
+            // ignore bundles without trace
+        }
+        if (collected.length >= args.limit)
+            break;
+    }
+    // Sort by updatedAt desc across bundles
+    collected.sort((a, b) => new Date(b.updatedAt).getTime() - new Date(a.updatedAt).getTime());
+    return {
+        scannedBundles: bundleIds.length,
+        truncated: truncated ? true : undefined,
+        edges: collected.slice(0, args.limit),
+    };
+}
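The service layer keeps a fast path (explicit `bundleId`, one SQLite file) and a budgeted slow path (scan up to `maxBundles` trace databases until `timeBudgetMs` runs out). A minimal calling sketch, assuming a `cfg` whose `storageDirs` points at existing bundle storage; the import path, bundle ID, and edge values are hypothetical:

```js
import { traceUpsert, traceQuery } from 'preflight-mcp/dist/trace/service.js';

// Hypothetical config; traceUpsert/traceQuery only read cfg.storageDirs.
const cfg = { storageDirs: ['/tmp/preflight-bundles'] };

// Upsert one heuristic edge linking a doc section to a source file.
await traceUpsert(cfg, {
  bundleId: 'bdl_example', // hypothetical bundle ID that must already exist
  edges: [{
    source: { type: 'doc', id: 'README.md#setup' },
    target: { type: 'file', id: 'src/config.ts' },
    type: 'mentions',
    confidence: 0.7,
    method: 'heuristic',
  }],
});

// Fast path: bundleId given, so only that bundle's trace DB is opened.
const hit = await traceQuery(cfg, {
  bundleId: 'bdl_example',
  source_type: 'doc',
  source_id: 'README.md#setup',
});

// Slow path: no bundleId, so up to maxBundles DBs are scanned within timeBudgetMs.
const scan = await traceQuery(cfg, {
  source_type: 'doc',
  source_id: 'README.md#setup',
  timeBudgetMs: 2_000,
});
console.log(hit.edges.length, scan.scannedBundles, scan.truncated);
```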
package/dist/trace/store.js
ADDED

@@ -0,0 +1,170 @@
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import crypto from 'node:crypto';
+import Database from 'better-sqlite3';
+function sha256Hex(text) {
+    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+function edgeDeterministicId(e) {
+    if (e.id && e.id.trim())
+        return e.id.trim();
+    return `tr_${sha256Hex(`${e.source.type}|${e.source.id}|${e.type}|${e.target.type}|${e.target.id}`).slice(0, 24)}`;
+}
+async function ensureDir(p) {
+    await fs.mkdir(p, { recursive: true });
+}
+export async function ensureTraceDb(traceDbPath) {
+    await ensureDir(path.dirname(traceDbPath));
+    const db = new Database(traceDbPath);
+    try {
+        db.pragma('journal_mode = WAL');
+        db.pragma('synchronous = NORMAL');
+        db.exec(`
+      CREATE TABLE IF NOT EXISTS trace_edges (
+        id TEXT PRIMARY KEY,
+        source_type TEXT NOT NULL,
+        source_id TEXT NOT NULL,
+        target_type TEXT NOT NULL,
+        target_id TEXT NOT NULL,
+        edge_type TEXT NOT NULL,
+        confidence REAL NOT NULL,
+        method TEXT NOT NULL,
+        sources_json TEXT NOT NULL,
+        created_at TEXT NOT NULL,
+        updated_at TEXT NOT NULL
+      );
+
+      CREATE INDEX IF NOT EXISTS idx_trace_edges_source ON trace_edges(source_type, source_id);
+      CREATE INDEX IF NOT EXISTS idx_trace_edges_target ON trace_edges(target_type, target_id);
+      CREATE INDEX IF NOT EXISTS idx_trace_edges_edge_type ON trace_edges(edge_type);
+    `);
+    }
+    finally {
+        db.close();
+    }
+}
+export async function upsertTraceEdges(traceDbPath, edges) {
+    await ensureTraceDb(traceDbPath);
+    const db = new Database(traceDbPath);
+    try {
+        db.pragma('journal_mode = WAL');
+        db.pragma('synchronous = NORMAL');
+        const upsert = db.prepare(`
+      INSERT INTO trace_edges (
+        id, source_type, source_id, target_type, target_id, edge_type,
+        confidence, method, sources_json, created_at, updated_at
+      ) VALUES (
+        @id, @source_type, @source_id, @target_type, @target_id, @edge_type,
+        @confidence, @method, @sources_json, @created_at, @updated_at
+      )
+      ON CONFLICT(id) DO UPDATE SET
+        source_type=excluded.source_type,
+        source_id=excluded.source_id,
+        target_type=excluded.target_type,
+        target_id=excluded.target_id,
+        edge_type=excluded.edge_type,
+        confidence=excluded.confidence,
+        method=excluded.method,
+        sources_json=excluded.sources_json,
+        updated_at=excluded.updated_at;
+    `);
+        const now = new Date().toISOString();
+        const tx = db.transaction((items) => {
+            const ids = [];
+            for (const e of items) {
+                const id = edgeDeterministicId(e);
+                const confidence = typeof e.confidence === 'number' ? Math.max(0, Math.min(1, e.confidence)) : 0.5;
+                const method = e.method === 'exact' ? 'exact' : 'heuristic';
+                const sourcesJson = JSON.stringify(e.sources ?? []);
+                // Preserve created_at on update by reading existing row (cheap single get)
+                const existing = db
+                    .prepare('SELECT created_at FROM trace_edges WHERE id = ?')
+                    .get(id);
+                upsert.run({
+                    id,
+                    source_type: e.source.type,
+                    source_id: e.source.id,
+                    target_type: e.target.type,
+                    target_id: e.target.id,
+                    edge_type: e.type,
+                    confidence,
+                    method,
+                    sources_json: sourcesJson,
+                    created_at: existing?.created_at ?? now,
+                    updated_at: now,
+                });
+                ids.push(id);
+            }
+            return ids;
+        });
+        const ids = tx(edges);
+        return { upserted: ids.length, ids };
+    }
+    finally {
+        db.close();
+    }
+}
+export function queryTraceEdges(traceDbPath, params) {
+    const db = new Database(traceDbPath, { readonly: true });
+    try {
+        const where = [];
+        const bind = {};
+        if (params.source) {
+            where.push('source_type = @source_type AND source_id = @source_id');
+            bind.source_type = params.source.type;
+            bind.source_id = params.source.id;
+        }
+        if (params.target) {
+            where.push('target_type = @target_type AND target_id = @target_id');
+            bind.target_type = params.target.type;
+            bind.target_id = params.target.id;
+        }
+        if (params.edgeType) {
+            where.push('edge_type = @edge_type');
+            bind.edge_type = params.edgeType;
+        }
+        const sqlWhere = where.length ? `WHERE ${where.join(' AND ')}` : '';
+        const limit = Math.min(500, Math.max(1, params.limit ?? 50));
+        const stmt = db.prepare(`
+      SELECT
+        id,
+        source_type,
+        source_id,
+        target_type,
+        target_id,
+        edge_type,
+        confidence,
+        method,
+        sources_json,
+        created_at,
+        updated_at
+      FROM trace_edges
+      ${sqlWhere}
+      ORDER BY updated_at DESC
+      LIMIT ${limit}
+    `);
+        const rows = stmt.all(bind);
+        return rows.map((r) => ({
+            id: r.id,
+            source: { type: r.source_type, id: r.source_id },
+            target: { type: r.target_type, id: r.target_id },
+            type: r.edge_type,
+            confidence: r.confidence,
+            method: r.method === 'exact' ? 'exact' : 'heuristic',
+            sources: (() => {
+                try {
+                    const parsed = JSON.parse(r.sources_json);
+                    return Array.isArray(parsed) ? parsed : [];
+                }
+                catch {
+                    return [];
+                }
+            })(),
+            createdAt: r.created_at,
+            updatedAt: r.updated_at,
+        }));
+    }
+    finally {
+        db.close();
+    }
+}
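Because `edgeDeterministicId` hashes `source|type|target` when no explicit `id` is supplied, re-upserting the same logical edge rewrites one row: `updated_at` moves forward while the pre-read `created_at` is preserved. A small sketch against a throwaway database; the path, import path, and IDs are hypothetical, and `upsertTraceEdges` calls `ensureTraceDb` itself:

```js
import { upsertTraceEdges, queryTraceEdges } from 'preflight-mcp/dist/trace/store.js';

const dbPath = '/tmp/trace-demo/trace/trace.sqlite3'; // hypothetical location
const edge = {
  source: { type: 'issue', id: '#17' },
  target: { type: 'file', id: 'src/server.ts' },
  type: 'relates_to',
};

const first = await upsertTraceEdges(dbPath, [edge]);                       // confidence defaults to 0.5, method to 'heuristic'
const second = await upsertTraceEdges(dbPath, [{ ...edge, confidence: 0.9, method: 'exact' }]);

// Same deterministic id both times, so trace_edges still holds one row.
console.log(first.ids[0] === second.ids[0]); // true

const rows = queryTraceEdges(dbPath, { source: { type: 'issue', id: '#17' } });
console.log(rows.length, rows[0].method, rows[0].confidence); // 1 'exact' 0.9
```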
package/package.json
CHANGED

@@ -1,13 +1,13 @@
 {
   "name": "preflight-mcp",
-  "version": "0.1.2",
+  "version": "0.1.4",
   "description": "MCP server that creates evidence-based preflight bundles for GitHub repositories and library docs.",
   "type": "module",
   "license": "MIT",
   "author": "preflight-mcp contributors",
   "repository": {
     "type": "git",
-    "url": "https://github.com/jonnyhoo/preflight-mcp.git"
+    "url": "git+https://github.com/jonnyhoo/preflight-mcp.git"
   },
   "bugs": {
     "url": "https://github.com/jonnyhoo/preflight-mcp/issues"
@@ -48,10 +48,12 @@
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.25.1",
+    "@vscode/tree-sitter-wasm": "^0.3.0",
     "adm-zip": "^0.5.16",
     "better-sqlite3": "^12.5.0",
     "ignore": "^7.0.5",
     "node-cron": "^4.2.1",
+    "web-tree-sitter": "^0.26.3",
     "zod": "^4.2.1"
   },
   "devDependencies": {
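The two new dependencies back the tree-sitter AST support added in `dist/ast/treeSitter.js`: `web-tree-sitter` provides the wasm parser runtime, while `@vscode/tree-sitter-wasm` ships prebuilt grammar binaries. A sketch of how such a pairing is typically wired together, not code from this package: the named exports assume web-tree-sitter 0.25+, and the grammar file path is an assumption about the @vscode package layout:

```js
import { Parser, Language } from 'web-tree-sitter';

// Assumed location of a prebuilt grammar inside @vscode/tree-sitter-wasm.
const grammarWasm = 'node_modules/@vscode/tree-sitter-wasm/wasm/tree-sitter-typescript.wasm';

await Parser.init();                           // load the tree-sitter runtime wasm
const lang = await Language.load(grammarWasm); // load one grammar
const parser = new Parser();
parser.setLanguage(lang);

const tree = parser.parse('export const answer = 42;');
console.log(tree?.rootNode.toString());        // prints the S-expression parse tree
```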
package/dist/bundle/deepwiki.js
DELETED

@@ -1,206 +0,0 @@
-import crypto from 'node:crypto';
-import fs from 'node:fs/promises';
-import path from 'node:path';
-function nowIso() {
-    return new Date().toISOString();
-}
-function toPosix(p) {
-    return p.replaceAll('\\', '/');
-}
-function sha256Hex(text) {
-    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
-}
-async function ensureDir(p) {
-    await fs.mkdir(p, { recursive: true });
-}
-/**
- * Parse DeepWiki URL to extract owner/repo.
- * Supports formats like:
- * - https://deepwiki.com/owner/repo
- * - https://deepwiki.com/owner/repo/path/to/doc
- */
-function parseDeepWikiUrl(url) {
-    try {
-        const parsed = new URL(url);
-        if (!parsed.hostname.includes('deepwiki.com'))
-            return null;
-        const parts = parsed.pathname.split('/').filter(Boolean);
-        if (parts.length < 2)
-            return null;
-        return { owner: parts[0], repo: parts[1] };
-    }
-    catch {
-        return null;
-    }
-}
-/**
- * Fetch a DeepWiki page and extract its content.
- * Returns the page content as Markdown.
- */
-async function fetchDeepWikiPage(url, timeoutMs = 30000) {
-    const controller = new AbortController();
-    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
-    try {
-        const res = await fetch(url, {
-            headers: {
-                'User-Agent': 'preflight-mcp/0.1.1',
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            },
-            signal: controller.signal,
-        });
-        if (!res.ok) {
-            throw new Error(`HTTP ${res.status}: ${res.statusText}`);
-        }
-        const html = await res.text();
-        // Extract main content - DeepWiki typically renders docs in a main content area.
-        // This is a best-effort extraction; real implementation would need more sophisticated parsing.
-        const content = extractMarkdownFromHtml(html);
-        const title = extractTitle(html);
-        return { content, title };
-    }
-    finally {
-        clearTimeout(timeoutId);
-    }
-}
-/**
- * Simple HTML to Markdown-ish text extraction.
- * This is a best-effort converter for documentation pages.
- */
-function extractMarkdownFromHtml(html) {
-    // Remove script and style tags
-    let text = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
-    text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
-    // Convert common HTML elements to Markdown-ish format
-    text = text.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n');
-    text = text.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n');
-    text = text.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n');
-    text = text.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n');
-    text = text.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n##### $1\n');
-    text = text.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n###### $1\n');
-    // Code blocks
-    text = text.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '\n```\n$1\n```\n');
-    text = text.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n');
-    text = text.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
-    // Lists
-    text = text.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
-    text = text.replace(/<ul[^>]*>/gi, '\n');
-    text = text.replace(/<\/ul>/gi, '\n');
-    text = text.replace(/<ol[^>]*>/gi, '\n');
-    text = text.replace(/<\/ol>/gi, '\n');
-    // Paragraphs and line breaks
-    text = text.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n$1\n');
-    text = text.replace(/<br\s*\/?>/gi, '\n');
-    text = text.replace(/<hr\s*\/?>/gi, '\n---\n');
-    // Bold and italic
-    text = text.replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**');
-    text = text.replace(/<b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**');
-    text = text.replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, '*$1*');
-    text = text.replace(/<i[^>]*>([\s\S]*?)<\/i>/gi, '*$1*');
-    // Links
-    text = text.replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
-    // Remove remaining HTML tags
-    text = text.replace(/<[^>]+>/g, '');
-    // Decode HTML entities
-    text = text.replace(/&amp;/g, '&');
-    text = text.replace(/&lt;/g, '<');
-    text = text.replace(/&gt;/g, '>');
-    text = text.replace(/&quot;/g, '"');
-    text = text.replace(/&#39;/g, "'");
-    text = text.replace(/&nbsp;/g, ' ');
-    // Clean up whitespace
-    text = text.replace(/\n{3,}/g, '\n\n');
-    text = text.trim();
-    return text;
-}
-function extractTitle(html) {
-    const match = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
-    if (match?.[1]) {
-        return match[1].replace(/\s*[-|]\s*DeepWiki.*$/i, '').trim();
-    }
-    return undefined;
-}
-function clipUtf8(text, maxBytes) {
-    const normalized = text.replace(/\r\n/g, '\n');
-    const buf = Buffer.from(normalized, 'utf8');
-    if (buf.length <= maxBytes)
-        return { text: normalized, truncated: false };
-    const clipped = buf.subarray(0, maxBytes).toString('utf8');
-    return { text: `${clipped}\n\n[TRUNCATED]\n`, truncated: true };
-}
-export async function ingestDeepWikiRepo(params) {
-    const parsed = parseDeepWikiUrl(params.url);
-    if (!parsed) {
-        return {
-            files: [],
-            summary: {
-                kind: 'deepwiki',
-                url: params.url,
-                repoId: params.url,
-                fetchedAt: nowIso(),
-                notes: ['Invalid DeepWiki URL format'],
-            },
-        };
-    }
-    const repoId = `${parsed.owner}/${parsed.repo}`;
-    const fetchedAt = nowIso();
-    const notes = [];
-    const files = [];
-    const fileRelPaths = [];
-    // Create deepwiki directory structure
-    const deepwikiDir = path.join(params.bundlePaths.rootDir, 'deepwiki', parsed.owner, parsed.repo);
-    const normDir = path.join(deepwikiDir, 'norm');
-    await ensureDir(normDir);
-    try {
-        const { content, title } = await fetchDeepWikiPage(params.url);
-        if (!content.trim()) {
-            notes.push('DeepWiki page returned empty content');
-        }
-        else {
-            const clipped = clipUtf8(content, params.cfg.maxFileBytes);
-            if (clipped.truncated) {
-                notes.push(`Content truncated to maxFileBytes=${params.cfg.maxFileBytes}`);
-            }
-            // Add header with source info
-            const header = `# ${title || repoId} (DeepWiki)\n\nSource: ${params.url}\nFetched: ${fetchedAt}\n\n---\n\n`;
-            const finalContent = header + clipped.text;
-            const fileName = 'index.md';
-            const absDocPath = path.join(normDir, fileName);
-            await fs.writeFile(absDocPath, finalContent, 'utf8');
-            const bundleRelPosix = toPosix(path.relative(params.bundlePaths.rootDir, absDocPath));
-            fileRelPaths.push(bundleRelPosix);
-            files.push({
-                repoId: `deepwiki:${repoId}`,
-                kind: 'doc',
-                repoRelativePath: fileName,
-                bundleNormRelativePath: bundleRelPosix,
-                bundleNormAbsPath: absDocPath,
-                sha256: sha256Hex(finalContent),
-                bytes: Buffer.byteLength(finalContent, 'utf8'),
-            });
-        }
-    }
-    catch (err) {
-        notes.push(`Failed to fetch DeepWiki page: ${err instanceof Error ? err.message : String(err)}`);
-    }
-    // Write meta.json
-    const metaPath = path.join(deepwikiDir, 'meta.json');
-    await fs.writeFile(metaPath, JSON.stringify({
-        kind: 'deepwiki',
-        url: params.url,
-        repoId,
-        fetchedAt,
-        files: fileRelPaths,
-        notes: notes.length > 0 ? notes : undefined,
-    }, null, 2) + '\n', 'utf8');
-    return {
-        files,
-        summary: {
-            kind: 'deepwiki',
-            url: params.url,
-            repoId,
-            fetchedAt,
-            files: fileRelPaths.length > 0 ? fileRelPaths : undefined,
-            notes: notes.length > 0 ? notes : undefined,
-        },
-    };
-}