sdtk-wiki-kit 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -13
- package/assets/atlas/build_atlas.py +164 -79
- package/package.json +1 -1
- package/src/commands/help.js +38 -3
- package/src/commands/lint.js +2 -1
- package/src/commands/operations.js +345 -0
- package/src/commands/search.js +89 -0
- package/src/commands/wiki.js +83 -9
- package/src/index.js +35 -1
- package/src/lib/wiki-compile.js +694 -6
- package/src/lib/wiki-extract.js +637 -0
- package/src/lib/wiki-flags.js +8 -0
- package/src/lib/wiki-lint.js +179 -2
- package/src/lib/wiki-search.js +175 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
const crypto = require("crypto");
|
|
4
|
+
const fs = require("fs");
|
|
5
|
+
const path = require("path");
|
|
6
|
+
const { CliError, ValidationError } = require("./errors");
|
|
7
|
+
const {
|
|
8
|
+
assertWikiWorkspaceWritePath,
|
|
9
|
+
getWikiReportsPath,
|
|
10
|
+
resolveProjectPath,
|
|
11
|
+
} = require("./wiki-paths");
|
|
12
|
+
|
|
13
|
+
// File-name prefix of the JSON report written by runWikiExtractDryRun.
const REPORT_PREFIX = "semantic-extraction-dry-run";

// Value emitted as `schema_version` in the extraction record.
const SCHEMA_VERSION = 1;
|
|
15
|
+
// Path fragments that are never scanned. A single-segment entry matches any
// path segment with that name; a multi-segment entry (e.g. ".sdtk/wiki") must
// match consecutive segments. Order matters: the first matching entry is the
// one reported as the exclusion reason.
const EXCLUDE_FRAGS = [
  // Version control and the tool's own workspaces.
  ".git", ".sdtk/wiki", ".sdtk/atlas",
  // Dependency and virtual-environment directories.
  "node_modules", ".venv", "venv",
  // Build output and caches.
  "dist", "build", "coverage", ".next", ".turbo", ".cache", "__pycache__",
];
|
|
30
|
+
|
|
31
|
+
// Keyword rules used to tag a source with concepts. A rule applies when any of
// its keywords occurs as a case-insensitive substring of the source text.
// Order matters: the first matching rule supplies the source's category.
const CONCEPT_RULES = [
  {
    concept_id: "concept_self_hosted_project_management",
    name: "self-hosted project management",
    aliases: [
      "Trello alternative",
      "realtime collaboration",
      "project management",
    ],
    keywords: [
      "trello",
      "jira",
      "project management",
      "realtime collaboration",
      "collaboration",
    ],
    category: "project_management",
  },
  {
    concept_id: "concept_agent_skills",
    name: "agent skills",
    aliases: [
      "AI agent skills",
      "skill pack",
      "coding agent skills",
    ],
    keywords: [
      "skill",
      "skills",
      "agent",
      "claude code",
      "codex",
      "cursor",
      "gemini",
    ],
    category: "agent_skills",
  },
  {
    concept_id: "concept_secret_scanning",
    name: "secret scanning",
    aliases: [
      "API key detection",
      "token detection",
      "credential scanning",
    ],
    keywords: [
      "gitleaks",
      "secret",
      "token",
      "password",
      "api key",
    ],
    category: "secret_scanning",
  },
  {
    concept_id: "concept_self_hosted_local_first_tools",
    name: "self-hosted local-first tools",
    aliases: [
      "local-first",
      "privacy",
      "self-hosted",
    ],
    keywords: [
      "self-hosted",
      "local",
      "privacy",
      "offline",
    ],
    category: "local_first_privacy",
  },
  {
    concept_id: "concept_ai_media_generation",
    name: "AI media generation",
    aliases: [
      "AI video",
      "AI music",
      "generative media",
    ],
    keywords: [
      "video",
      "music",
      "suno",
      "lipdub",
      "comfyui",
      "ace-step",
    ],
    category: "ai_media",
  },
  {
    concept_id: "concept_document_management",
    name: "document management",
    aliases: [
      "paperless",
      "PDF workflow",
      "document archive",
    ],
    keywords: [
      "paperless",
      "document",
      "pdf",
      "ocr",
      "archive",
    ],
    category: "document_management",
  },
  {
    concept_id: "concept_developer_tooling",
    name: "developer tooling",
    aliases: [
      "open source developer tool",
      "CLI",
      "framework",
    ],
    keywords: [
      "github",
      "repo",
      "open-source",
      "mã nguồn",
      "cli",
      "framework",
      "developer",
    ],
    category: "developer_tooling",
  },
];
|
|
82
|
+
|
|
83
|
+
/**
 * Convert a path-like value to forward-slash form.
 *
 * @param {*} value - Path to convert; any falsy value is treated as "".
 * @returns {string} The value as a string with every backslash replaced by "/".
 */
function toPosix(value) {
  const text = String(value || "");
  return text.split("\\").join("/");
}
|
|
86
|
+
|
|
87
|
+
/**
 * Tell whether a value starts with an http, https or ftp URL scheme.
 *
 * @param {*} value - Candidate source root; falsy values are treated as "".
 * @returns {boolean} True when the value begins with `http://`, `https://` or
 *   `ftp://` (case-insensitive).
 */
function isRemoteUrl(value) {
  const candidate = String(value || "");
  const remoteScheme = /^(?:https?|ftp):\/\//i;
  return remoteScheme.test(candidate);
}
|
|
90
|
+
|
|
91
|
+
/**
 * Compute the hex-encoded SHA-256 digest of a value.
 *
 * Binary input (Buffer / Uint8Array) is hashed byte-for-byte. Converting it to
 * a string first would decode it as UTF-8 and replace every invalid sequence
 * with U+FFFD, so distinct files could end up with the same digest. For valid
 * UTF-8 content the digest is unchanged by this.
 *
 * @param {*} value - Bytes to hash, or any other value, which is hashed via
 *   its `String(value)` form.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256(value) {
  const payload = value instanceof Uint8Array ? value : String(value);
  return crypto.createHash("sha256").update(payload).digest("hex");
}
|
|
94
|
+
|
|
95
|
+
/**
 * Format a date as a compact UTC stamp such as `20240102-030405Z`.
 *
 * @param {Date} [date] - Moment to format; defaults to the current time.
 * @returns {string} The ISO-8601 UTC string with dashes, colons and
 *   milliseconds removed and the `T` separator replaced by `-`.
 */
function timestampStamp(date = new Date()) {
  const iso = date.toISOString();
  const compact = iso.replace(/[-:]/g, "");
  const withoutMillis = compact.replace(/\.\d{3}Z$/, "Z");
  return withoutMillis.replace("T", "-");
}
|
|
98
|
+
|
|
99
|
+
/**
 * Turn arbitrary text into a lowercase, dash-separated ASCII slug.
 *
 * @param {*} value - Text to slugify; falsy values are treated as "".
 * @param {string} [fallback="item"] - Returned when nothing usable remains.
 * @returns {string} A slug of at most 64 characters made of `a-z`, `0-9` and
 *   single dashes, with no leading or trailing dash, or `fallback`.
 */
function safeSlug(value, fallback = "item") {
  const trimDashes = (input) => input.replace(/^-+|-+$/g, "");

  // Decompose accented characters and drop the combining marks.
  const unaccented = String(value || "")
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "");

  // URLs are removed entirely rather than slugified.
  const withoutUrls = unaccented.toLowerCase().replace(/https?:\/\/\S+/g, "");

  const dashed = trimDashes(withoutUrls.replace(/[^a-z0-9]+/g, "-"));
  const collapsed = dashed.replace(/-+/g, "-");

  // Cutting at 64 characters can expose a dash at the end, so trim again.
  const clipped = trimDashes(collapsed.slice(0, 64));
  return clipped || fallback;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Split an exclude fragment into lowercase path segments.
 *
 * @param {string} frag - Fragment such as ".git" or ".sdtk/wiki".
 * @returns {string[]} Lowercase segments with leading/trailing slashes and
 *   empty segments removed.
 */
function normaliseExcludeFragment(frag) {
  const trimmed = toPosix(frag).replace(/^\/+|\/+$/g, "");
  const segments = trimmed.toLowerCase().split("/");
  return segments.filter((segment) => segment !== "");
}
|
|
116
|
+
|
|
117
|
+
/**
 * Check whether a path falls under one of the EXCLUDE_FRAGS entries.
 *
 * The comparison is done case-insensitively on the path relative to
 * `sourceRoot`. A fragment matches when its segments appear as consecutive
 * segments anywhere in that relative path.
 *
 * @param {string} targetPath - File or directory being considered.
 * @param {string} sourceRoot - Root the relative path is computed against.
 * @returns {string|null} `exclude:<fragment>` for the first matching fragment,
 *   or null when nothing matches.
 */
function isExcluded(targetPath, sourceRoot) {
  const relativePath = toPosix(path.relative(sourceRoot, targetPath)).toLowerCase();
  const segments = relativePath
    .split("/")
    .filter((segment) => segment && segment !== ".");

  const matchedFragment = EXCLUDE_FRAGS.find((frag) => {
    const wanted = normaliseExcludeFragment(frag);
    if (wanted.length === 0) return false;
    // Slide a window of the fragment's length across the path segments.
    const lastStart = segments.length - wanted.length;
    for (let start = 0; start <= lastStart; start += 1) {
      const windowMatches = wanted.every((piece, offset) => segments[start + offset] === piece);
      if (windowMatches) return true;
    }
    return false;
  });

  return matchedFragment === undefined ? null : `exclude:${matchedFragment}`;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Walk a source root and collect the Markdown files (`.md` / `.markdown`) in it.
 *
 * Directories matching an exclude fragment are not descended into. The walk
 * follows symbolic links (via `fs.statSync`), so it guards against two
 * failure modes:
 *  - each directory is visited once per real path, which stops symlink cycles
 *    and avoids collecting the same directory twice through different links;
 *  - an entry that cannot be stat'ed or listed (broken link, permission error,
 *    entry removed mid-scan) is skipped instead of aborting the whole scan.
 *
 * @param {string} sourceRoot - Existing file or directory to scan.
 * @returns {{files: string[], skipped: Array<{path: string, reason: string}>, scanned: number}}
 *   `files` is sorted by POSIX-style path; `skipped` lists entries that were
 *   seen but not collected; `scanned` counts the Markdown-named entries seen.
 * @throws {Error} If `sourceRoot` itself cannot be stat'ed.
 */
function collectMarkdownFiles(sourceRoot) {
  const markdownName = /\.md(?:arkdown)?$/i;
  const files = [];
  const skipped = [];
  const visitedDirs = new Set();
  let scanned = 0;

  function recordSkip(target, reason) {
    skipped.push({
      path: toPosix(path.relative(sourceRoot, target)),
      reason,
    });
  }

  function visit(current) {
    let stat;
    try {
      stat = fs.statSync(current);
    } catch (error) {
      // Only Markdown-named entries count as sources; other unreadable
      // entries would have been ignored anyway.
      if (markdownName.test(current)) {
        scanned += 1;
        recordSkip(current, `unreadable:${error.code || "stat_failed"}`);
      }
      return;
    }

    if (stat.isDirectory()) {
      if (isExcluded(current, sourceRoot)) return;
      let children;
      try {
        const realDir = fs.realpathSync(current);
        if (visitedDirs.has(realDir)) return;
        visitedDirs.add(realDir);
        children = fs.readdirSync(current).sort();
      } catch (error) {
        recordSkip(current, `unreadable:${error.code || "readdir_failed"}`);
        return;
      }
      for (const child of children) {
        visit(path.join(current, child));
      }
      return;
    }

    if (!stat.isFile()) return;
    if (!markdownName.test(current)) return;
    scanned += 1;
    const excluded = isExcluded(current, sourceRoot);
    if (excluded) {
      recordSkip(current, excluded);
      return;
    }
    files.push(current);
  }

  const rootStat = fs.statSync(sourceRoot);
  if (rootStat.isFile()) {
    // A single-file root is collected directly; exclude rules are not applied.
    if (markdownName.test(sourceRoot)) {
      scanned += 1;
      files.push(sourceRoot);
    }
  } else {
    visit(sourceRoot);
  }

  files.sort((a, b) => toPosix(a).localeCompare(toPosix(b)));
  return { files, skipped, scanned };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Read the `title:` entry from a leading `---` frontmatter block.
 *
 * @param {string} text - Full document text.
 * @returns {string} The title with one surrounding quote character stripped
 *   from each end, or "" when the first line is not `---` or no `title:` line
 *   precedes the closing `---`.
 */
function parseFrontmatterTitle(text) {
  const [opener, ...rest] = text.split(/\r?\n/);
  if (opener === undefined || opener.trim() !== "---") return "";

  for (const entry of rest) {
    // The closing fence ends the frontmatter block.
    if (entry.trim() === "---") return "";
    const found = /^title:\s*(.+?)\s*$/i.exec(entry);
    if (found) {
      return found[1].trim().replace(/^["']|["']$/g, "");
    }
  }
  return "";
}
|
|
187
|
+
|
|
188
|
+
/**
 * Pick a title for a Markdown document.
 *
 * Preference order: frontmatter `title:`, then the first `# ` heading, then
 * the file name without its extension (with runs of `_`/`-` turned into
 * spaces).
 *
 * @param {string} text - Full document text.
 * @param {string} filePath - Path of the document, used for the fallback.
 * @returns {string} The chosen title.
 */
function extractTitle(text, filePath) {
  const fromFrontmatter = parseFrontmatterTitle(text);
  if (fromFrontmatter) {
    return fromFrontmatter;
  }

  const headingMatch = /^#\s+(.+?)\s*$/m.exec(text);
  if (headingMatch) {
    return headingMatch[1].trim();
  }

  const extension = path.extname(filePath);
  const stem = path.basename(filePath, extension);
  return stem.replace(/[_-]+/g, " ").trim();
}
|
|
195
|
+
|
|
196
|
+
/**
 * Look for signs that text was decoded with the wrong character encoding.
 *
 * Markers searched for: the replacement character U+FFFD, "Ã" or "Â" followed
 * by any character, "â€" and "ðŸ". Matching is case-sensitive on purpose: with
 * a case-insensitive pattern the lowercase letters "ã" and "â", which are
 * ordinary in Vietnamese and Portuguese text, would be reported as mojibake.
 *
 * @param {string} text - Document text to inspect.
 * @returns {{hasMojibake: boolean, score: number}} `score` is the number of
 *   markers per 500 characters of text, capped at 1.
 */
function detectMojibake(text) {
  // Escapes stand for: U+FFFD | Ã. | Â. | â€ | ðŸ
  const markers = /\uFFFD|\u00C3.|\u00C2.|\u00E2\u20AC|\u00F0\u0178/g;
  const matches = text.match(markers) || [];
  return {
    hasMojibake: matches.length > 0,
    score: Math.min(1, matches.length / Math.max(1, text.length / 500)),
  };
}
|
|
203
|
+
|
|
204
|
+
/**
 * Find the distinct GitHub `owner/repo` references in a text.
 *
 * Links are recognised with or without a scheme and `www.` prefix. Trailing
 * punctuation and a `.git` suffix are removed from the repository name.
 * References whose repository part is empty after that clean-up (for example
 * `github.com/owner/.git`) or contains an ellipsis are ignored. Duplicates are
 * detected case-insensitively and the first occurrence wins.
 *
 * @param {string} text - Document text to scan.
 * @returns {Array<{owner: string, repo: string, github_url: string, key: string}>}
 *   One record per repository in order of first appearance; `key` is the
 *   lowercase `owner/repo`.
 */
function extractGithubRepos(text) {
  const githubRegex = /(?:https?:\/\/)?(?:www\.)?github\.com\/([A-Za-z0-9](?:[A-Za-z0-9-]{0,38}))\/([A-Za-z0-9._-]+)/gi;
  const reposByKey = new Map();
  let match;
  while ((match = githubRegex.exec(text)) !== null) {
    const owner = match[1];
    const rawRepo = match[2].replace(/[).,;:]+$/g, "");
    // Truncated links such as "github.com/owner/..." are not usable.
    if (!rawRepo || rawRepo.includes("...")) continue;
    const repo = rawRepo.replace(/\.git$/i, "");
    // A bare ".git" leaves nothing behind; do not emit an empty repo name.
    if (!repo) continue;
    const key = `${owner.toLowerCase()}/${repo.toLowerCase()}`;
    if (reposByKey.has(key)) continue;
    reposByKey.set(key, {
      owner,
      repo,
      github_url: `https://github.com/${owner}/${repo}`,
      key,
    });
  }
  return [...reposByKey.values()];
}
|
|
226
|
+
|
|
227
|
+
/**
 * Collect GitHub links that were cut short with an ellipsis.
 *
 * @param {string} text - Document text to scan.
 * @returns {string[]} The distinct matched link fragments (for example
 *   `github.com/...`) in order of first appearance.
 */
function extractUnsupportedGithubItems(text) {
  const truncatedLink = /github\.com\/(?:\.\.\.|[^\s)]+\.{3}[^\s)]*)/gi;
  const hits = text.match(truncatedLink) || [];
  return Array.from(new Set(hits));
}
|
|
236
|
+
|
|
237
|
+
/**
 * Select the concept rules whose keywords occur in a text.
 *
 * Matching is a case-insensitive substring test, so a keyword also matches
 * inside longer words.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<object>} Matching entries of CONCEPT_RULES, in rule order.
 */
function inferConcepts(text) {
  const haystack = text.toLowerCase();
  const matched = [];
  for (const rule of CONCEPT_RULES) {
    for (const keyword of rule.keywords) {
      if (haystack.includes(keyword.toLowerCase())) {
        matched.push(rule);
        break;
      }
    }
  }
  return matched;
}
|
|
241
|
+
|
|
242
|
+
/**
 * Decide the category label for a source.
 *
 * @param {string} text - Source text, used only when no concept matched.
 * @param {Array<{category: string}>} concepts - Concepts matched for the source.
 * @returns {string} The first concept's category; otherwise
 *   "developer_tooling" when the text mentions "github" or "repo"
 *   (case-insensitive substring), else "uncategorized".
 */
function categoryForSource(text, concepts) {
  if (concepts.length > 0) {
    return concepts[0].category;
  }
  const haystack = text.toLowerCase();
  const mentionsCode = ["github", "repo"].some((marker) => haystack.includes(marker));
  return mentionsCode ? "developer_tooling" : "uncategorized";
}
|
|
248
|
+
|
|
249
|
+
/**
 * Map a numeric confidence onto a named tier.
 *
 * @param {number} confidence - Confidence value.
 * @returns {string} "high" for >= 0.8, "medium" for >= 0.5, "low" for >= 0.2,
 *   otherwise "unsupported".
 */
function confidenceTier(confidence) {
  // Ordered from the highest floor down; the first floor reached wins.
  const tiers = [
    [0.8, "high"],
    [0.5, "medium"],
    [0.2, "low"],
  ];
  const reached = tiers.find(([floor]) => confidence >= floor);
  return reached ? reached[1] : "unsupported";
}
|
|
255
|
+
|
|
256
|
+
/**
 * Describe a source root for inclusion in the extraction record.
 *
 * @param {string} sourceRoot - Resolved source root path.
 * @returns {object} Reference whose id is derived from a hash of the
 *   lowercase POSIX-style path and whose label is a slug of the root's base
 *   name.
 */
function makeSourceRootRef(sourceRoot) {
  const displayPath = toPosix(sourceRoot);
  const rootDigest = sha256(`source-root:v1:${displayPath.toLowerCase()}`);
  const rootLabel = safeSlug(path.basename(sourceRoot), "source-root");

  return {
    source_root_id: `root_${rootDigest.slice(0, 12)}`,
    source_root_label: rootLabel,
    source_root_type: "external_local",
    source_root_display_path: displayPath,
    normalization_version: "source-root:v1",
  };
}
|
|
267
|
+
|
|
268
|
+
/**
 * Build a provenance record pointing at one line of a source.
 *
 * @param {object} params
 * @param {string} params.sourceId - Id of the source the evidence comes from.
 * @param {string} params.sourceHash - Content hash of that source.
 * @param {string} params.sourceRelativePath - Path relative to the source root.
 * @param {string} params.sourceLogicalPath - Root label plus relative path.
 * @param {number} [params.line] - 1-based line number; falsy values become 1.
 * @param {string} params.generatedAt - Timestamp stored on the record.
 * @param {number} params.confidence - Confidence stored on the record.
 * @returns {object} Provenance record. Its id combines the source id with the
 *   zero-padded line number, so two records for the same source and line share
 *   an id.
 */
function buildProvenance({ sourceId, sourceHash, sourceRelativePath, sourceLogicalPath, line, generatedAt, confidence }) {
  const lineNumber = line || 1;
  const paddedLine = String(lineNumber).padStart(3, "0");

  const locator = {
    type: "line_range",
    start_line: lineNumber,
    end_line: lineNumber,
    heading: null,
  };

  return {
    provenance_id: `prov_${sourceId}_${paddedLine}`,
    source_id: sourceId,
    source_hash: sourceHash,
    source_relative_path: sourceRelativePath,
    source_logical_path: sourceLogicalPath,
    locator,
    evidence_quote_hash: sha256(`${sourceId}:${lineNumber}`),
    extractor: "sdtk-wiki.semantic-extract",
    extractor_version: "bk132-dry-run",
    generated_at: generatedAt,
    confidence,
  };
}
|
|
288
|
+
|
|
289
|
+
/**
 * Find the 1-based line number on which a substring first appears.
 *
 * @param {string} text - Text to search.
 * @param {string} needle - Substring to locate.
 * @returns {number} Line number of the first occurrence, or 1 when the
 *   substring is not found.
 */
function lineOf(text, needle) {
  const position = text.indexOf(needle);
  if (position === -1) {
    return 1;
  }
  // The line number is one more than the number of line breaks before the hit.
  const breaksBefore = text.slice(0, position).match(/\r?\n/g);
  return (breaksBefore ? breaksBefore.length : 0) + 1;
}
|
|
294
|
+
|
|
295
|
+
/**
 * Build the semantic extraction record for every Markdown file under a source
 * root. Nothing is written to disk here; files are only read.
 *
 * For each collected file this derives a source record with quality flags,
 * one tool entity / claim / relation / provenance entry per GitHub repository
 * reference, concept records from the keyword rules, and unsupported-item
 * records for truncated GitHub links. Afterwards, sources sharing the same
 * primary GitHub URL are marked as duplicate candidates, and a comparison and
 * synthesis entry is produced for every concept linked to two or more tools.
 *
 * @param {object} params
 * @param {string} params.projectPath - Project path stored in the record.
 * @param {string} params.sourceRoot - Existing file or directory to scan.
 * @returns {object} Extraction record (`record_type`
 *   "sdtk_wiki_semantic_extraction") containing counts, sources, entities,
 *   concepts, claims, relations, comparisons, syntheses, quality findings,
 *   unsupported items and provenance.
 */
function buildExtraction({ projectPath, sourceRoot }) {
  const generatedAt = new Date().toISOString();
  const sourceRootRef = makeSourceRootRef(sourceRoot);
  const collected = collectMarkdownFiles(sourceRoot);
  const sources = [];
  const toolEntitiesById = new Map();
  const conceptsById = new Map();
  const claims = [];
  const relations = [];
  const provenance = [];
  const sourceQualityFindings = [];
  const unsupportedItems = [];
  const sourceUrlUsage = new Map();

  const toolEntityIdFor = (repo) => `tool_github_${safeSlug(repo.owner, "owner")}_${safeSlug(repo.repo, "repo")}`;

  for (const filePath of collected.files) {
    // Read the file once so the text and its hash describe the same snapshot.
    const rawContent = fs.readFileSync(filePath);
    const text = rawContent.toString("utf-8");
    const lowerText = text.toLowerCase();
    const sourceHash = sha256(rawContent);
    const stats = fs.statSync(filePath);
    // NOTE: when sourceRoot is itself a file, path.relative yields "" here.
    const sourceRelativePath = toPosix(path.relative(sourceRoot, filePath));
    const sourceLogicalPath = `${sourceRootRef.source_root_label}/${sourceRelativePath}`;
    const sourceDisplayPath = toPosix(filePath);
    const sourceId = `src_${sha256(`local-md:v1:${sourceRootRef.source_root_id}:${sourceRelativePath.toLowerCase()}`).slice(0, 16)}`;
    const title = extractTitle(text, filePath);
    const repos = extractGithubRepos(text);
    const unsupportedGithub = extractUnsupportedGithubItems(text);
    const matchedConcepts = inferConcepts(`${title}\n${text}`);
    const category = categoryForSource(`${title}\n${text}`, matchedConcepts);
    const mojibake = detectMojibake(text);
    // The alternatives are grouped so the anchors apply to each of them; an
    // ungrouped pattern would flag any title that merely contains "note".
    const weakTitle = title.length < 6 || /^(?:untitled|note|readme)$/i.test(title);
    const sourceUrl = repos.length > 0 ? repos[0].github_url : null;
    const sourceSlug = `${safeSlug(title || sourceRelativePath, "source")}--${sourceId.slice(0, 8)}`;
    const qualityFlags = [];
    const qualityNotes = [];

    if (mojibake.hasMojibake) {
      qualityFlags.push("mojibake_detected");
      qualityNotes.push("Potential mojibake or replacement characters detected.");
    }
    if (!sourceUrl) {
      qualityFlags.push("missing_source_url");
      qualityNotes.push("No valid GitHub/source URL was extracted.");
    }
    if (weakTitle) {
      qualityFlags.push("weak_title");
      qualityNotes.push("Title is missing, very short, or generic.");
    }
    if (repos.length === 0) {
      qualityFlags.push("low_confidence_extraction");
    }

    const sourceRecord = {
      source_id: sourceId,
      source_root_id: sourceRootRef.source_root_id,
      source_relative_path: sourceRelativePath,
      source_logical_path: sourceLogicalPath,
      source_display_path: sourceDisplayPath,
      source_type: "markdown",
      title,
      source_url: sourceUrl,
      source_hash: sourceHash,
      size_bytes: stats.size,
      modified_time: stats.mtime.toISOString(),
      encoding_quality: mojibake.hasMojibake ? "suspect" : "clean",
      source_quality: {
        has_mojibake: mojibake.hasMojibake,
        mojibake_score: Number(mojibake.score.toFixed(3)),
        has_source_url: Boolean(sourceUrl),
        weak_title: weakTitle,
        duplicate_candidate: false,
        duplicate_group_id: null,
        low_confidence_extraction: repos.length === 0,
        // Same array as qualityFlags: later pushes are visible on the record.
        quality_flags: qualityFlags,
        notes: qualityNotes,
      },
      provenance_refs: [],
      target_page_path: `.sdtk/wiki/personal-brain/sources/${sourceSlug}.md`,
    };

    sources.push(sourceRecord);

    if (sourceUrl) {
      const existing = sourceUrlUsage.get(sourceUrl) || [];
      existing.push(sourceRecord);
      sourceUrlUsage.set(sourceUrl, existing);
    }

    for (const rawUnsupported of unsupportedGithub) {
      unsupportedItems.push({
        record_type: "unsupported_item",
        item_id: `unsupported_${sourceId}_${String(unsupportedItems.length + 1).padStart(3, "0")}`,
        source_id: sourceId,
        reason: "unsupported_url_format",
        raw_observation_summary: rawUnsupported,
        confidence: 0.1,
        confidence_tier: "unsupported",
        provenance_refs: [],
      });
      if (!qualityFlags.includes("unsupported_url_format")) {
        qualityFlags.push("unsupported_url_format");
      }
    }

    if (qualityFlags.length > 0) {
      sourceQualityFindings.push({
        finding_id: `sq_${sourceId}`,
        source_id: sourceId,
        source_relative_path: sourceRelativePath,
        source_logical_path: sourceLogicalPath,
        quality_flags: [...sourceRecord.source_quality.quality_flags],
        confidence: repos.length === 0 ? 0.3 : 0.7,
        confidence_tier: repos.length === 0 ? "low" : "medium",
        notes: qualityNotes,
      });
    }

    for (const repo of repos) {
      const entityId = toolEntityIdFor(repo);
      const prov = buildProvenance({
        sourceId,
        sourceHash,
        sourceRelativePath,
        sourceLogicalPath,
        // Search for the scheme-less, lowercase form: the normalised
        // https:// URL is absent from the text when the link was written
        // with http://, www. or no scheme at all.
        line: lineOf(lowerText, `github.com/${repo.key}`),
        generatedAt,
        confidence: 0.9,
      });
      provenance.push(prov);
      sourceRecord.provenance_refs.push(prov.provenance_id);

      if (!toolEntitiesById.has(entityId)) {
        toolEntitiesById.set(entityId, {
          entity_id: entityId,
          entity_type: "tool_entity",
          name: repo.repo,
          repo_owner: repo.owner,
          repo_name: repo.repo,
          github_url: repo.github_url,
          category,
          summary: `${repo.repo} is a locally sourced GitHub tool candidate in category ${category}.`,
          confidence: 0.9,
          confidence_tier: "high",
          source_refs: [],
          provenance_refs: [],
          target_page_path: `.sdtk/wiki/personal-brain/entities/tools/${safeSlug(repo.repo, "tool")}--${entityId}.md`,
        });
      }
      const entity = toolEntitiesById.get(entityId);
      if (!entity.source_refs.includes(sourceId)) entity.source_refs.push(sourceId);
      if (!entity.provenance_refs.includes(prov.provenance_id)) entity.provenance_refs.push(prov.provenance_id);

      claims.push({
        claim_id: `claim_${sourceId}_${String(claims.length + 1).padStart(3, "0")}`,
        text: `The local source presents ${repo.repo} as a ${category} tool or project.`,
        subject_entity_id: entityId,
        source_refs: [sourceId],
        provenance_refs: [prov.provenance_id],
        confidence: 0.75,
        confidence_tier: "medium",
        contested: false,
      });

      relations.push({
        relation_id: `rel_${sourceId}_${String(relations.length + 1).padStart(3, "0")}`,
        source_id: sourceId,
        target_id: entityId,
        relation_type: "source_mentions_entity",
        evidence: "The local Markdown source includes a GitHub repository URL.",
        source_refs: [sourceId],
        provenance_refs: [prov.provenance_id],
        confidence: 0.9,
        confidence_tier: "high",
      });
    }

    for (const conceptRule of matchedConcepts) {
      if (!conceptsById.has(conceptRule.concept_id)) {
        conceptsById.set(conceptRule.concept_id, {
          concept_id: conceptRule.concept_id,
          name: conceptRule.name,
          aliases: conceptRule.aliases,
          definition: `Local sources contain evidence related to ${conceptRule.name}.`,
          related_entities: [],
          source_refs: [],
          // Nothing in this function adds to a concept's provenance_refs.
          provenance_refs: [],
          confidence: 0.65,
          confidence_tier: "medium",
          target_page_path: `.sdtk/wiki/personal-brain/concepts/${safeSlug(conceptRule.name, "concept")}.md`,
        });
      }
      const concept = conceptsById.get(conceptRule.concept_id);
      if (!concept.source_refs.includes(sourceId)) concept.source_refs.push(sourceId);

      for (const repo of repos) {
        const entityId = toolEntityIdFor(repo);
        if (!concept.related_entities.includes(entityId)) concept.related_entities.push(entityId);
        relations.push({
          relation_id: `rel_${sourceId}_${String(relations.length + 1).padStart(3, "0")}`,
          source_id: entityId,
          target_id: conceptRule.concept_id,
          relation_type: "entity_implements_concept",
          evidence: `The local source text matches ${conceptRule.name} keywords.`,
          source_refs: [sourceId],
          provenance_refs: [...sourceRecord.provenance_refs],
          confidence: 0.7,
          confidence_tier: "medium",
        });
      }
    }
  }

  // Sources whose first extracted repository URL is identical are flagged as
  // duplicate candidates of each other.
  for (const [sourceUrl, sourceRecords] of sourceUrlUsage.entries()) {
    if (sourceRecords.length < 2) continue;
    const duplicateGroupId = `dup_${sha256(sourceUrl).slice(0, 12)}`;
    for (const record of sourceRecords) {
      record.source_quality.duplicate_candidate = true;
      record.source_quality.duplicate_group_id = duplicateGroupId;
      if (!record.source_quality.quality_flags.includes("duplicate_source_candidate")) {
        record.source_quality.quality_flags.push("duplicate_source_candidate");
      }
      sourceQualityFindings.push({
        finding_id: `sq_duplicate_${record.source_id}`,
        source_id: record.source_id,
        source_relative_path: record.source_relative_path,
        source_logical_path: record.source_logical_path,
        quality_flags: ["duplicate_source_candidate"],
        duplicate_group_id: duplicateGroupId,
        confidence: 0.8,
        confidence_tier: "high",
        notes: [`Duplicate source URL candidate: ${sourceUrl}`],
      });
    }
  }

  const toolEntities = [...toolEntitiesById.values()].sort((a, b) => a.entity_id.localeCompare(b.entity_id));
  const concepts = [...conceptsById.values()].sort((a, b) => a.concept_id.localeCompare(b.concept_id));
  const comparisons = [];
  const syntheses = [];

  for (const concept of concepts) {
    // A comparison needs at least two tools to compare.
    if (concept.related_entities.length < 2) continue;
    const topicSlug = safeSlug(concept.name, "topic");
    comparisons.push({
      comparison_id: `comparison_${topicSlug}_${sha256(concept.related_entities.join("|")).slice(0, 8)}`,
      topic: concept.name,
      compared_entities: concept.related_entities.slice(0, 8),
      criteria: ["local evidence", "category fit", "source confidence"],
      source_refs: concept.source_refs,
      provenance_refs: concept.provenance_refs,
      confidence: 0.55,
      confidence_tier: "medium",
      target_page_path: `.sdtk/wiki/personal-brain/comparisons/${topicSlug}.md`,
    });
    syntheses.push({
      synthesis_id: `synthesis_${topicSlug}_${sha256(concept.source_refs.join("|")).slice(0, 8)}`,
      topic: concept.name,
      summary: `Local sources mention ${concept.related_entities.length} tool candidate(s) related to ${concept.name}.`,
      recommendations: ["Review extracted entities and source quality findings before compile/apply work."],
      source_refs: concept.source_refs,
      provenance_refs: concept.provenance_refs,
      confidence: 0.55,
      confidence_tier: "medium",
      target_page_path: `.sdtk/wiki/personal-brain/syntheses/${topicSlug}.md`,
    });
  }

  const lowConfidence = sourceQualityFindings.filter((finding) => ["low", "unsupported"].includes(finding.confidence_tier)).length;

  return {
    schema_version: SCHEMA_VERSION,
    record_type: "sdtk_wiki_semantic_extraction",
    generated_at: generatedAt,
    project_path: projectPath,
    source_root_refs: [sourceRootRef],
    source_counts: {
      scanned: collected.scanned,
      indexed: sources.length,
      extracted: toolEntities.length,
      skipped: collected.skipped.length,
      low_confidence: lowConfidence,
      unsupported: unsupportedItems.length,
    },
    skipped_sources: collected.skipped,
    sources,
    tool_entities: toolEntities,
    concepts,
    claims,
    relations,
    comparisons,
    syntheses,
    source_quality_findings: sourceQualityFindings,
    unsupported_items: unsupportedItems,
    provenance,
  };
}
|
|
589
|
+
|
|
590
|
+
/**
 * Validate the `--source-root` argument and turn it into an absolute path.
 *
 * @param {string} sourceRootArg - Raw value of `--source-root`.
 * @returns {string} Absolute path of an existing file or directory.
 * @throws {ValidationError} When the argument is missing, is a remote URL,
 *   does not exist, or is neither a regular file nor a directory.
 */
function resolveSourceRoot(sourceRootArg) {
  const reject = (message) => {
    throw new ValidationError(message);
  };

  if (!sourceRootArg) {
    reject("sdtk-wiki wiki extract requires --source-root <path>. No project files were changed.");
  }
  if (isRemoteUrl(sourceRootArg)) {
    reject("Remote URL source roots are not supported for semantic extraction dry-run. No project files were changed.");
  }

  const resolved = path.resolve(sourceRootArg);
  if (!fs.existsSync(resolved)) {
    reject(`--source-root does not exist: ${resolved}. No project files were changed.`);
  }

  const info = fs.statSync(resolved);
  const isUsableKind = info.isDirectory() || info.isFile();
  if (!isUsableKind) {
    reject(`--source-root is not a file or directory: ${resolved}. No project files were changed.`);
  }
  return resolved;
}
|
|
607
|
+
|
|
608
|
+
/**
 * Run the semantic extraction and write its result as a JSON report.
 *
 * The report is written into the directory returned by getWikiReportsPath,
 * under a name made of REPORT_PREFIX and a timestamp derived from the
 * extraction's `generated_at`. Both the directory and the file path are
 * passed through assertWikiWorkspaceWritePath before anything is written.
 *
 * @param {object} params
 * @param {string} [params.projectPath] - Project directory; defaults to the
 *   current working directory.
 * @param {string} params.sourceRootArg - Raw `--source-root` value.
 * @returns {{reportPath: string, extraction: object}} Path of the written
 *   report and the extraction record it contains.
 * @throws {ValidationError} When the project path is not an existing
 *   directory or the source root is invalid.
 * @throws {CliError} For any failure inside the extract-and-write step; errors
 *   that are not already CliError instances are wrapped.
 */
function runWikiExtractDryRun({ projectPath, sourceRootArg }) {
  const projectDir = resolveProjectPath(projectPath || process.cwd());
  const projectDirIsValid = fs.existsSync(projectDir) && fs.statSync(projectDir).isDirectory();
  if (!projectDirIsValid) {
    throw new ValidationError(`--project-path is not a valid directory: ${projectDir}`);
  }
  const sourceRoot = resolveSourceRoot(sourceRootArg);

  try {
    const reportsDir = getWikiReportsPath(projectDir);
    assertWikiWorkspaceWritePath(reportsDir, projectDir);

    const extraction = buildExtraction({ projectPath: projectDir, sourceRoot });

    const stamp = timestampStamp(new Date(extraction.generated_at));
    const reportPath = path.join(reportsDir, `${REPORT_PREFIX}-${stamp}.json`);
    assertWikiWorkspaceWritePath(reportPath, projectDir);

    fs.mkdirSync(reportsDir, { recursive: true });
    const serialized = JSON.stringify(extraction, null, 2) + "\n";
    fs.writeFileSync(reportPath, serialized, "utf-8");

    return { reportPath, extraction };
  } catch (error) {
    if (!(error instanceof CliError)) {
      throw new CliError(`Failed to write SDTK-WIKI semantic extraction dry-run report: ${error.message}`);
    }
    throw error;
  }
}
|
|
632
|
+
|
|
633
|
+
module.exports = {
|
|
634
|
+
REPORT_PREFIX,
|
|
635
|
+
buildExtraction,
|
|
636
|
+
runWikiExtractDryRun,
|
|
637
|
+
};
|
package/src/lib/wiki-flags.js
CHANGED
|
@@ -42,11 +42,18 @@ const DISCOVER_FLAG_DEFS = {
|
|
|
42
42
|
plan: { type: "boolean" },
|
|
43
43
|
};
|
|
44
44
|
|
|
45
|
+
// Flag definitions for the extract command: two string-valued paths and a
// boolean dry-run switch.
const EXTRACT_FLAG_DEFS = {
  // String flags.
  "project-path": { type: "string" },
  "source-root": { type: "string" },
  // Boolean flags.
  "dry-run": { type: "boolean" },
};
|
|
50
|
+
|
|
45
51
|
// Flag definitions for the compile command.
const COMPILE_FLAG_DEFS = {
  // String flags.
  "project-path": { type: "string" },
  plan: { type: "string" },
  // Boolean flags.
  "dry-run": { type: "boolean" },
  apply: { type: "boolean" },
  yes: { type: "boolean" },
};
|
|
51
58
|
|
|
52
59
|
function parseWikiFlags(args, defs) {
|
|
@@ -80,6 +87,7 @@ module.exports = {
|
|
|
80
87
|
BASE_FLAG_DEFS,
|
|
81
88
|
COMPILE_FLAG_DEFS,
|
|
82
89
|
DISCOVER_FLAG_DEFS,
|
|
90
|
+
EXTRACT_FLAG_DEFS,
|
|
83
91
|
INIT_FLAG_DEFS,
|
|
84
92
|
LINT_FLAG_DEFS,
|
|
85
93
|
OPEN_FLAG_DEFS,
|