sdtk-wiki-kit 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ const {
8
8
  getWikiGraphPath,
9
9
  getWikiPagesPath,
10
10
  getWikiProvenanceSourcesPath,
11
+ getWikiRawSourcesPath,
11
12
  getWikiReportsPath,
12
13
  getWikiWorkspacePath,
13
14
  isPathInsideOrEqual,
@@ -38,10 +39,11 @@ const CATEGORY_DEFS = [
38
39
  ["stale", "Stale pages"],
39
40
  ["markers", "TODO/Open Questions/Gaps"],
40
41
  ["contradictions", "Candidate contradictions"],
42
+ ["sourceQuality", "Source quality"],
41
43
  ];
42
44
 
43
45
  function toPosix(value) {
44
- return value.split(path.sep).join("/");
46
+ return String(value || "").replace(/\\/g, "/");
45
47
  }
46
48
 
47
49
  function stripQuotes(value) {
@@ -171,19 +173,192 @@ function readLintInputs(projectPath, findings) {
171
173
  const pagesRoot = getWikiPagesPath(projectPath);
172
174
  const pageFiles = listMarkdownPages(pagesRoot);
173
175
  const provenance = readJsonIfPresent(getWikiProvenanceSourcesPath(projectPath), { sources: [] });
176
+ const raw = readJsonIfPresent(getWikiRawSourcesPath(projectPath), { sources: [] });
174
177
  const graph = readJsonIfPresent(path.join(getWikiGraphPath(projectPath), "SDTK_DOC_GRAPH.json"), {
175
178
  edges: [],
176
179
  });
180
+ const graphIndex = readJsonIfPresent(path.join(getWikiGraphPath(projectPath), "SDTK_DOC_INDEX.json"), {
181
+ documents: [],
182
+ });
177
183
 
178
184
  if (provenance && provenance.__lintError) {
179
185
  appendFinding(findings, "provenance", provenance.__lintError);
180
186
  }
187
+ if (raw && raw.__lintError) {
188
+ appendFinding(findings, "sourceQuality", raw.__lintError);
189
+ }
181
190
  if (graph && graph.__lintError) {
182
191
  appendFinding(findings, "downstream", graph.__lintError);
183
192
  }
193
+ if (graphIndex && graphIndex.__lintError) {
194
+ appendFinding(findings, "sourceQuality", graphIndex.__lintError);
195
+ }
184
196
 
185
197
  const sources = provenance && Array.isArray(provenance.sources) ? provenance.sources : [];
186
- return { graph, pageFiles, pagesRoot, sources };
198
+ const rawSources = raw && Array.isArray(raw.sources) ? raw.sources : [];
199
+ const graphDocuments = graphIndex && Array.isArray(graphIndex.documents) ? graphIndex.documents : [];
200
+ return { graph, graphDocuments, pageFiles, pagesRoot, rawSources, sources };
201
+ }
202
+
203
/**
 * Normalizes a source path for comparison: forward slashes only, with a single
 * leading "./" removed.
 *
 * @param {*} value - Path-like value handed to toPosix.
 * @returns {string} The normalized path.
 */
function normalizeSourcePath(value) {
  const posixPath = toPosix(value);
  return posixPath.startsWith("./") ? posixPath.slice(2) : posixPath;
}
206
+
207
/**
 * Extracts the normalized path of a source record.
 *
 * The first truthy value among `sourcePath`, `path` and `id` is used.
 *
 * @param {*} record - Source record; anything that is not a non-null object yields "".
 * @returns {string} Normalized path, or "" when none is available.
 */
function sourceRecordPath(record) {
  const isRecord = Boolean(record) && typeof record === "object";
  if (!isRecord) {
    return "";
  }
  const rawPath = record.sourcePath || record.path || record.id || "";
  return normalizeSourcePath(rawPath);
}
211
+
212
/**
 * Resolves a normalized source path to an existing regular file on disk.
 *
 * @param {string} projectPath - Base directory for relative source paths.
 * @param {string} sourcePath - Forward-slash source path; may be absolute.
 * @returns {string|null} Absolute path of the file, or null when the path is
 *   empty, does not exist, or is not a regular file.
 */
function resolveSourceFilePath(projectPath, sourcePath) {
  if (!sourcePath) {
    return null;
  }

  // Convert to the host separator before resolving.
  const nativePath = sourcePath.split("/").join(path.sep);
  const candidate = path.isAbsolute(nativePath)
    ? path.resolve(nativePath)
    : path.resolve(projectPath, nativePath);

  const isRegularFile = fs.existsSync(candidate) && fs.statSync(candidate).isFile();
  return isRegularFile ? candidate : null;
}
219
+
220
/**
 * Finds GitHub repository references (`github.com/<owner>/<repo>`) in free text.
 *
 * The scheme and a `www.` prefix are optional. The host must start at the
 * beginning of the text or after a character that cannot be part of a host
 * name, so look-alike hosts (`notgithub.com`) and other subdomains
 * (`gist.github.com`, `api.github.com`) are not reported as repositories.
 *
 * @param {*} text - Text to scan; falsy input is treated as "".
 * @returns {Array<{owner: string, repo: string, url: string}>} Unique
 *   repositories in order of first appearance (uniqueness is case-insensitive
 *   on the canonical URL).
 */
function extractGithubRepos(text) {
  const repos = [];
  const seen = new Set();
  // The leading group is a host boundary: without it the pattern also matched
  // inside longer host names.
  const matcher =
    /(?:^|[^A-Za-z0-9.-])(?:https?:\/\/)?(?:www\.)?github\.com\/([A-Za-z0-9](?:[A-Za-z0-9-]{0,38}))\/([A-Za-z0-9._-]+)/gi;
  let match;
  while ((match = matcher.exec(String(text || ""))) !== null) {
    const owner = match[1];
    // Drop sentence punctuation captured by the permissive repo class, then a ".git" suffix.
    const repo = match[2].replace(/[).,;:]+$/g, "").replace(/\.git$/i, "");
    if (!repo || repo.includes("...")) continue;
    const url = `https://github.com/${owner}/${repo}`;
    const key = url.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    repos.push({ owner, repo, url });
  }
  return repos;
}
237
+
238
/**
 * Collects up to three short excerpts around mojibake-looking character
 * sequences (UTF-8 bytes that were decoded as Latin-1/Windows-1252).
 *
 * Detected patterns:
 *   - U+FFFD, the replacement character;
 *   - U+00C2 or U+00C3 followed by a character that a UTF-8 continuation byte
 *     decodes to in Latin-1/Windows-1252;
 *   - U+00E2 U+20AC, the start of mis-decoded typographic punctuation;
 *   - U+00F0 U+0178, the start of a mis-decoded four-byte sequence.
 *
 * Matching is case-sensitive and the lead characters must be followed by a
 * continuation-like character, so ordinary accented text such as "não",
 * "AÇÃO" or "Âncora" is not reported.
 *
 * @param {*} text - Text to scan; falsy input is treated as "".
 * @returns {string[]} At most three whitespace-collapsed excerpts, each
 *   covering up to 20 characters before and 40 characters from the match start.
 */
function detectMojibakeExamples(text) {
  // Slice the same string that is matched against, so a non-string input
  // cannot desynchronise the two.
  const content = String(text || "");
  const examples = [];
  const matcher =
    /\uFFFD|[\u00C2\u00C3][\u0080-\u00BF\u0152\u0153\u0160\u0161\u0178\u017D\u017E\u0192\u02C6\u02DC\u2013\u2014\u2018-\u201A\u201C-\u201E\u2020-\u2022\u2026\u2030\u2039\u203A\u20AC\u2122]|\u00E2\u20AC|\u00F0\u0178/g;
  let match;
  while ((match = matcher.exec(content)) !== null) {
    const start = Math.max(0, match.index - 20);
    const end = Math.min(content.length, match.index + 40);
    examples.push(content.slice(start, end).replace(/\s+/g, " ").trim());
    if (examples.length >= 3) break;
  }
  return examples;
}
250
+
251
/**
 * Decides whether a source title is too weak to be useful.
 *
 * A title is weak when, after trimming, it is shorter than six characters, is
 * exactly "untitled", "note" or "readme" (case-insensitive), or equals the
 * file stem with runs of "_" / "-" replaced by a single space.
 *
 * @param {*} title - Candidate title; falsy input is treated as "".
 * @param {string} sourcePath - Path whose base name (without extension) is the stem.
 * @returns {boolean} True when the title is considered weak.
 */
function weakTitle(title, sourcePath) {
  const text = String(title || "").trim();
  const stem = path.basename(sourcePath || "", path.extname(sourcePath || ""));
  return (
    text.length < 6 ||
    // The alternatives are grouped so both anchors apply to each of them;
    // ungrouped, "note" matched anywhere inside a title (e.g. "Keynote").
    /^(?:untitled|note|readme)$/i.test(text) ||
    text === stem.replace(/[_-]+/g, " ")
  );
}
260
+
261
/**
 * Runs the source-quality lint and appends its findings under the
 * "sourceQuality" category.
 *
 * Checks, in the order findings are appended:
 *   1. raw sources whose path is missing from the provenance sources;
 *   2. provenance sources missing from the graph document index (only when
 *      that index lists at least one document);
 *   3. per provenance source: unreadable file, mojibake-like text, no detected
 *      GitHub URL, weak title, low-confidence extraction;
 *   4. sources sharing the same first detected repository URL;
 *   5. sources sharing any detected repository.
 *
 * @param {string} projectPath - Project root used to resolve source paths.
 * @param {{sources: Array, rawSources: Array, graphDocuments: Array}} inputs -
 *   Provenance records, raw-source records and graph index documents.
 * @param {*} findings - Findings accumulator passed through to appendFinding.
 * @returns {void}
 */
function analyzeSourceQuality(projectPath, inputs, findings) {
  const provenancePaths = new Set(inputs.sources.map(sourceRecordPath).filter(Boolean));
  const rawPaths = new Set(inputs.rawSources.map(sourceRecordPath).filter(Boolean));
  const graphPaths = new Set(
    inputs.graphDocuments
      .map((record) => normalizeSourcePath(record && (record.id || record.path)))
      .filter(Boolean)
  );
  const repoToSources = new Map();
  const urlToSources = new Map();

  // Records `sourcePath` under the lower-cased URL so duplicates are detected case-insensitively.
  const remember = (index, url, sourcePath) => {
    const key = url.toLowerCase();
    const existing = index.get(key) || [];
    existing.push(sourcePath);
    index.set(key, existing);
  };
  const quoteAll = (paths) => paths.map((item) => `\`${item}\``).join(", ");

  for (const sourcePath of rawPaths) {
    if (!provenancePaths.has(sourcePath)) {
      appendFinding(
        findings,
        "sourceQuality",
        `Raw source \`${sourcePath}\` is registered but absent from graph/provenance source coverage.`
      );
    }
  }

  for (const sourcePath of provenancePaths) {
    if (graphPaths.size > 0 && !graphPaths.has(sourcePath)) {
      appendFinding(
        findings,
        "sourceQuality",
        `Provenance source \`${sourcePath}\` is absent from graph document index.`
      );
    }
  }

  for (const record of inputs.sources) {
    const sourcePath = sourceRecordPath(record);
    const resolved = resolveSourceFilePath(projectPath, sourcePath);
    const title = String((record && record.title) || "");

    let text = null;
    if (resolved) {
      try {
        text = fs.readFileSync(resolved, "utf-8");
      } catch (error) {
        // A file that resolves but cannot be read (permissions, removed in the
        // meantime, ...) is reported as a finding below instead of aborting
        // the whole lint run.
        text = null;
      }
    }
    if (text === null) {
      appendFinding(
        findings,
        "sourceQuality",
        `Source \`${sourcePath || "(missing)"}\` could not be read for source-quality lint.`
      );
      continue;
    }

    const repos = extractGithubRepos(text);
    const mojibakeExamples = detectMojibakeExamples(text);

    if (mojibakeExamples.length > 0) {
      appendFinding(
        findings,
        "sourceQuality",
        `Source \`${sourcePath}\` has mojibake-like text examples: ${mojibakeExamples.map((item) => `\`${item}\``).join("; ")}.`
      );
    }

    if (repos.length === 0) {
      appendFinding(findings, "sourceQuality", `Source \`${sourcePath}\` has no detected GitHub/source URL.`);
    }

    if (weakTitle(title, sourcePath)) {
      appendFinding(findings, "sourceQuality", `Source \`${sourcePath}\` has a weak or filename-derived title \`${title || "(missing)"}\`.`);
    }

    // Same condition as the URL check above; kept as a separate finding and in
    // this position so the order of findings is unchanged.
    if (repos.length === 0) {
      appendFinding(findings, "sourceQuality", `Source \`${sourcePath}\` has low-confidence extraction because no valid GitHub repo candidate was detected.`);
    }

    if (repos.length > 0) {
      // The first detected repository is treated as the source's own URL.
      remember(urlToSources, repos[0].url, sourcePath);
    }

    for (const repo of repos) {
      remember(repoToSources, repo.url, sourcePath);
    }
  }

  for (const [url, paths] of urlToSources.entries()) {
    if (paths.length > 1) {
      appendFinding(
        findings,
        "sourceQuality",
        `Duplicate source URL candidate \`${url}\` appears in ${quoteAll(paths)}.`
      );
    }
  }

  for (const [repo, paths] of repoToSources.entries()) {
    if (paths.length > 1) {
      appendFinding(
        findings,
        "sourceQuality",
        `Duplicate GitHub repo candidate \`${repo}\` appears in ${quoteAll(paths)}.`
      );
    }
  }
}
188
363
 
189
364
  function analyzePages(projectPath) {
@@ -373,6 +548,8 @@ function analyzePages(projectPath) {
373
548
  }
374
549
  }
375
550
 
551
+ analyzeSourceQuality(projectPath, inputs, findings);
552
+
376
553
  return { findings, pageCount: pages.length };
377
554
  }
378
555
 
@@ -0,0 +1,175 @@
1
+ "use strict";
2
+
3
+ const fs = require("fs");
4
+ const path = require("path");
5
+ const { ValidationError } = require("./errors");
6
+ const {
7
+ getWikiWorkspacePath,
8
+ isPathInsideOrEqual,
9
+ resolveProjectPath,
10
+ } = require("./wiki-paths");
11
+
12
+ const DEFAULT_LIMIT = 10;
13
+ const PERSONAL_BRAIN_RELATIVE = path.join(".sdtk", "wiki", "personal-brain");
14
+
15
/**
 * Renders a path-like value with forward slashes only.
 *
 * @param {*} value - Path-like value; falsy input is treated as "".
 * @returns {string} The value as a string with every backslash replaced by "/".
 */
function toPosix(value) {
  const text = String(value || "");
  return text.split("\\").join("/");
}
18
+
19
/**
 * Lower-cases a value for case-insensitive matching.
 *
 * @param {*} value - Any value; falsy input is treated as "".
 * @returns {string} The lower-cased string form of the value.
 */
function normalizeText(value) {
  const text = value ? String(value) : "";
  return text.toLowerCase();
}
22
+
23
/**
 * Splits a query into lower-cased search tokens.
 *
 * Tokens are maximal runs of ASCII letters, digits, "_" and characters in the
 * range U+00C0–U+1EF9; tokens shorter than two characters are discarded.
 *
 * @param {*} query - Raw query; falsy input yields no tokens.
 * @returns {string[]} Tokens in order of appearance (duplicates are kept).
 */
function tokenize(query) {
  const tokens = [];
  for (const piece of normalizeText(query).split(/[^a-z0-9\u00c0-\u1ef9_]+/i)) {
    const token = piece.trim();
    if (token.length >= 2) {
      tokens.push(token);
    }
  }
  return tokens;
}
29
+
30
+ function collectMarkdownFiles(rootPath) {
31
+ const files = [];
32
+ function visit(current) {
33
+ const stat = fs.statSync(current);
34
+ if (stat.isDirectory()) {
35
+ for (const child of fs.readdirSync(current).sort()) {
36
+ visit(path.join(current, child));
37
+ }
38
+ return;
39
+ }
40
+ if (stat.isFile() && /\.md(?:arkdown)?$/i.test(current)) {
41
+ files.push(current);
42
+ }
43
+ }
44
+ visit(rootPath);
45
+ return files.sort((a, b) => toPosix(a).localeCompare(toPosix(b)));
46
+ }
47
+
48
/**
 * Derives a display title for a Markdown page.
 *
 * The first level-1 heading (`# Title`) wins. Without one, the file stem is
 * used with runs of "-" / "_" replaced by a single space.
 *
 * @param {string} text - Markdown content of the page.
 * @param {string} filePath - Path of the page, used for the fallback title.
 * @returns {string} The trimmed title.
 */
function extractTitle(text, filePath) {
  // Only spaces and tabs may separate "#" from the title. With `\s` the
  // separator could span a line break, so a bare "#" line took the following
  // line as its title.
  const heading = text.match(/^#[ \t]+(.+?)[ \t]*$/m);
  if (heading) return heading[1].trim();
  return path.basename(filePath, path.extname(filePath)).replace(/[-_]+/g, " ").trim();
}
53
+
54
/**
 * Builds a whitespace-collapsed excerpt of a page around the first hit.
 *
 * The hit is the first occurrence of the whole query (case-insensitive) or,
 * failing that, of the first token in `tokens` that occurs at all. The excerpt
 * spans from 80 characters before the hit to 180 characters after its start.
 * Without any hit, the first 180 characters of the collapsed text are returned.
 *
 * @param {string} text - Page content.
 * @param {string} query - Normalized query phrase.
 * @param {string[]} tokens - Query tokens, in priority order.
 * @returns {string} The excerpt.
 */
function snippetFor(text, query, tokens) {
  const haystack = normalizeText(text);
  const phrase = normalizeText(query);
  const collapse = (value) => value.replace(/\s+/g, " ").trim();

  let hit = phrase ? haystack.indexOf(phrase) : -1;
  if (hit < 0) {
    const firstTokenHit = tokens
      .map((token) => haystack.indexOf(token))
      .find((position) => position >= 0);
    hit = firstTokenHit === undefined ? -1 : firstTokenHit;
  }

  if (hit < 0) {
    return collapse(text).slice(0, 180);
  }

  const from = Math.max(0, hit - 80);
  const to = Math.min(text.length, hit + 180);
  return collapse(text.slice(from, to));
}
71
+
72
/**
 * Scores one page against a query.
 *
 * Whole-phrase hits add 50 (content), 30 (title) and 20 (path), cumulatively.
 * Each token found anywhere adds 12 when it is in the title, otherwise 8 when
 * it is in the path, otherwise 5. All comparisons are case-insensitive.
 *
 * @param {{text: string, title: string, relativePath: string, query: string, tokens: string[]}} page
 * @returns {{score: number, reasons: string[]}} Total score and the
 *   human-readable reasons behind it.
 */
function scoreFile({ text, title, relativePath, query, tokens }) {
  const inContent = normalizeText(text);
  const inHeading = normalizeText(title);
  const inLocation = normalizeText(relativePath);
  const phrase = normalizeText(query);
  const reasons = [];
  let score = 0;

  if (phrase) {
    const phraseRules = [
      [inContent, 50, "exact phrase match in page content"],
      [inHeading, 30, "exact phrase match in title"],
      [inLocation, 20, "exact phrase match in path"],
    ];
    for (const [haystack, points, reason] of phraseRules) {
      if (haystack.includes(phrase)) {
        score += points;
        reasons.push(reason);
      }
    }
  }

  let matchedTokens = 0;
  for (const token of tokens) {
    // Only the best-ranked location of a token counts.
    let points = 0;
    if (inHeading.includes(token)) {
      points = 12;
    } else if (inLocation.includes(token)) {
      points = 8;
    } else if (inContent.includes(token)) {
      points = 5;
    }
    if (points > 0) {
      matchedTokens += 1;
      score += points;
    }
  }
  if (matchedTokens > 0) {
    reasons.push(`matched ${matchedTokens}/${tokens.length} query token(s)`);
  }

  return { score, reasons };
}
109
+
110
/**
 * Searches the Markdown pages of the project's personal brain for a query.
 *
 * Pages are scored with scoreFile; only pages with a positive score are
 * returned, ordered by descending score and then by path.
 *
 * @param {object} options
 * @param {string} [options.projectPath] - Project root; defaults to the current working directory.
 * @param {string} options.query - Search query; must not be blank.
 * @param {number|string} [options.limit=DEFAULT_LIMIT] - Maximum number of
 *   matches, capped at 50; values that do not parse to a positive integer fall
 *   back to DEFAULT_LIMIT.
 * @returns {object} Search report: the query, resolved paths, number of
 *   scanned files, the limited matches, the total match count and static
 *   mode flags.
 * @throws {ValidationError} When the project path is not a directory, the
 *   query is blank, the personal-brain path is outside the project root, or
 *   the personal-brain directory does not exist.
 */
function runWikiSearch({ projectPath, query, limit = DEFAULT_LIMIT }) {
  const isDirectory = (candidate) => fs.existsSync(candidate) && fs.statSync(candidate).isDirectory();

  const resolvedProjectPath = resolveProjectPath(projectPath || process.cwd());
  if (!isDirectory(resolvedProjectPath)) {
    throw new ValidationError(`--project-path is not a valid directory: ${resolvedProjectPath}`);
  }

  const normalizedQuery = String(query || "").trim();
  if (normalizedQuery === "") {
    throw new ValidationError('sdtk-wiki search requires a query, for example: sdtk-wiki search --project-path <path> "multi-agent".');
  }

  const parsedLimit = Number.parseInt(limit, 10);
  let safeLimit = DEFAULT_LIMIT;
  if (Number.isFinite(parsedLimit) && parsedLimit > 0) {
    safeLimit = Math.min(parsedLimit, 50);
  }

  const personalBrainPath = path.join(getWikiWorkspacePath(resolvedProjectPath), "personal-brain");
  if (!isPathInsideOrEqual(personalBrainPath, resolvedProjectPath)) {
    throw new ValidationError("Refusing to search outside the project root.");
  }
  if (!isDirectory(personalBrainPath)) {
    throw new ValidationError(
      `No SDTK-WIKI personal brain found at ${personalBrainPath}. Run extract, compile dry-run, and compile --apply --yes from the generated JSON sidecar first.`
    );
  }

  const tokens = tokenize(normalizedQuery);
  const files = collectMarkdownFiles(personalBrainPath);

  // Score every page; pages without a positive score are dropped.
  const matches = files
    .map((filePath) => {
      const text = fs.readFileSync(filePath, "utf-8");
      const relativePath = toPosix(path.relative(resolvedProjectPath, filePath));
      const title = extractTitle(text, filePath);
      const { score, reasons } = scoreFile({ text, title, relativePath, query: normalizedQuery, tokens });
      if (score <= 0) {
        return null;
      }
      return {
        path: relativePath,
        title,
        score,
        why: reasons.join("; "),
        snippet: snippetFor(text, normalizedQuery, tokens),
      };
    })
    .filter((entry) => entry !== null);

  // Highest score first; ties are broken by path.
  matches.sort((left, right) => right.score - left.score || left.path.localeCompare(right.path));

  return {
    query: normalizedQuery,
    projectPath: resolvedProjectPath,
    personalBrainPath,
    scannedFiles: files.length,
    matches: matches.slice(0, safeLimit),
    totalMatches: matches.length,
    limit: safeLimit,
    searchMode: "local_deterministic_personal_brain_markdown",
    premiumRequired: false,
    mutated: false,
  };
}
170
+
171
+ module.exports = {
172
+ PERSONAL_BRAIN_RELATIVE,
173
+ runWikiSearch,
174
+ tokenize,
175
+ };