sdtk-wiki-kit 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -173,6 +173,39 @@ function collectMarkdownFiles(sourceRoot) {
173
173
  return { files: files.sort((a, b) => toPosix(a).localeCompare(toPosix(b))), skipped, scanned };
174
174
  }
175
175
 
176
/**
 * Recursively gathers every file whose name ends in `.json` (case-insensitive)
 * under `sourceRoot`.
 *
 * A directory is not descended into when `isExcluded` rejects it. A JSON file
 * is counted in `scanned` before its own exclusion check, so `scanned` can be
 * larger than `files.length`. When `sourceRoot` is itself a file it is accepted
 * without an exclusion check, provided it has a `.json` extension.
 *
 * @param {string} sourceRoot - File or directory path to scan.
 * @returns {{ files: string[], scanned: number }} Matching paths sorted by
 *   comparing their `toPosix` forms, plus the number of JSON files encountered.
 */
function collectJsonFiles(sourceRoot) {
  const jsonExtension = /\.json$/i;
  const found = [];
  let seen = 0;

  const walk = (entryPath) => {
    const info = fs.statSync(entryPath);

    if (info.isDirectory()) {
      if (isExcluded(entryPath, sourceRoot)) return;
      // Sorted traversal keeps the visiting order stable across platforms.
      const names = fs.readdirSync(entryPath).sort();
      names.forEach((name) => walk(path.join(entryPath, name)));
      return;
    }

    if (!info.isFile() || !jsonExtension.test(entryPath)) return;
    // Count first: excluded JSON files still contribute to `scanned`.
    seen += 1;
    if (!isExcluded(entryPath, sourceRoot)) found.push(entryPath);
  };

  if (fs.statSync(sourceRoot).isFile()) {
    if (jsonExtension.test(sourceRoot)) {
      seen += 1;
      found.push(sourceRoot);
    }
  } else {
    walk(sourceRoot);
  }

  found.sort((left, right) => toPosix(left).localeCompare(toPosix(right)));
  return { files: found, scanned: seen };
}
208
+
176
209
  function parseFrontmatterTitle(text) {
177
210
  const lines = text.split(/\r?\n/);
178
211
  if (!lines.length || lines[0].trim() !== "---") return "";
@@ -224,6 +257,20 @@ function extractGithubRepos(text) {
224
257
  return repos;
225
258
  }
226
259
 
260
/**
 * Parses a GitHub repository URL into its owner/repo parts.
 *
 * Accepts an optional `http(s)://` scheme and `www.` prefix, an optional
 * `.git` suffix, and anything after the repo segment that starts with `/`,
 * `?` or `#`. Surrounding whitespace and trailing sentence punctuation
 * (`)`, `.`, `,`, `;`, `:`) are tolerated so that URLs copied out of prose
 * still resolve; previously the punctuation clean-up below could never run
 * for anything but `.` because the anchored pattern rejected the input first.
 *
 * @param {*} value - Candidate URL; non-strings are coerced with `String`.
 * @returns {{ owner: string, repo: string, github_url: string, key: string } | null}
 *   Canonical repo descriptor (`key` is the lower-cased `owner/repo`), or
 *   `null` when the value is not a usable GitHub repository URL.
 */
function parseGithubRepoUrl(value) {
  const candidate = String(value || "").trim();
  // The repo character class already absorbs a `.git` suffix; it is removed below.
  const match = candidate.match(/^(?:https?:\/\/)?(?:www\.)?github\.com\/([A-Za-z0-9][A-Za-z0-9-]{0,38})\/([A-Za-z0-9._-]+)(?:[).,;:]+|[/?#].*)?$/i);
  if (!match) return null;
  const owner = match[1];
  const repo = match[2].replace(/[).,;:]+$/g, "").replace(/\.git$/i, "");
  // Elided URLs such as `github.com/owner/...` are placeholders, not repositories.
  if (!repo || repo === "..." || repo.includes("...")) return null;
  return {
    owner,
    repo,
    github_url: `https://github.com/${owner}/${repo}`,
    key: `${owner.toLowerCase()}/${repo.toLowerCase()}`,
  };
}
273
+
227
274
  function extractUnsupportedGithubItems(text) {
228
275
  const items = [];
229
276
  const invalidRegex = /github\.com\/(?:\.\.\.|[^\s)]+\.{3}[^\s)]*)/gi;
@@ -234,11 +281,39 @@ function extractUnsupportedGithubItems(text) {
234
281
  return [...new Set(items)];
235
282
  }
236
283
 
284
/**
 * Turns a raw topic label into a slug.
 *
 * Underscores are replaced with spaces before the value is handed to
 * `safeSlug`, with `"topic"` passed as the second argument.
 *
 * @param {*} value - Raw topic; falsy values are treated as an empty string.
 * @returns {string} Whatever `safeSlug` returns for the spaced label.
 */
function normalizeTopic(value) {
  const spacedLabel = String(value || "").replace(/_/g, " ");
  return safeSlug(spacedLabel, "topic");
}
287
+
288
/**
 * Builds a concept record from a single topic label.
 *
 * @param {*} topic - Raw topic label; it is stringified and trimmed.
 * @returns {object | null} A concept with empty reference lists and a fixed
 *   medium confidence (0.6), or `null` when the topic normalises to an empty
 *   slug or to the literal slug `"topic"`.
 */
function conceptFromTopic(topic) {
  const rawName = String(topic || "").trim();
  const slug = normalizeTopic(rawName);
  const usable = Boolean(slug) && slug !== "topic";
  if (!usable) return null;

  // Human-readable form: runs of `_` / `-` collapse to a single space.
  const displayName = rawName.replace(/[_-]+/g, " ");
  const idSuffix = slug.replace(/-/g, "_");

  return {
    concept_id: `concept_topic_${idSuffix}`,
    name: displayName,
    aliases: [rawName],
    definition: `Local structured sources include topic evidence for ${displayName}.`,
    related_entities: [],
    source_refs: [],
    provenance_refs: [],
    confidence: 0.6,
    confidence_tier: "medium",
    target_page_path: `wiki/concepts/${slug}.md`,
  };
}
305
+
237
306
  function inferConcepts(text) {
238
307
  const lower = text.toLowerCase();
239
308
  return CONCEPT_RULES.filter((rule) => rule.keywords.some((keyword) => lower.includes(keyword.toLowerCase())));
240
309
  }
241
310
 
311
/**
 * Converts a list of topic labels into concept records.
 *
 * Topics that `conceptFromTopic` rejects are dropped. Topics that map to the
 * same `concept_id` (for example labels differing only in case or separator)
 * yield a single concept — the first one seen — so callers that emit one
 * relation per returned concept do not produce duplicates.
 *
 * @param {*} topics - Expected to be an array of labels; anything else is
 *   treated as an empty list.
 * @returns {object[]} Unique concepts in first-seen order.
 */
function inferConceptsFromTopics(topics) {
  if (!Array.isArray(topics)) return [];
  const conceptsById = new Map();
  for (const topic of topics) {
    const concept = conceptFromTopic(topic);
    if (concept && !conceptsById.has(concept.concept_id)) {
      conceptsById.set(concept.concept_id, concept);
    }
  }
  return [...conceptsById.values()];
}
316
+
242
317
  function categoryForSource(text, concepts) {
243
318
  if (concepts.length > 0) return concepts[0].category;
244
319
  const lower = text.toLowerCase();
@@ -246,6 +321,54 @@ function categoryForSource(text, concepts) {
246
321
  return "uncategorized";
247
322
  }
248
323
 
324
/**
 * Normalises a confidence value to a number in the range [0, 1].
 *
 * - Finite numbers are clamped to [0, 1].
 * - The labels `high`, `medium`, `low` and `unsupported` (any case, trimmed)
 *   map to 0.85, 0.65, 0.35 and 0.1.
 * - Strings holding a decimal number (e.g. `"0.9"`) are parsed and clamped;
 *   these used to fall through to the default.
 * - Everything else yields the neutral default 0.5.
 *
 * @param {*} value - Raw confidence as found in a source record.
 * @returns {number} Confidence in [0, 1].
 */
function confidenceNumber(value) {
  const clamp = (amount) => Math.max(0, Math.min(1, amount));
  if (typeof value === "number" && Number.isFinite(value)) return clamp(value);

  const text = String(value || "").trim().toLowerCase();
  if (text === "high") return 0.85;
  if (text === "medium") return 0.65;
  if (text === "low") return 0.35;
  if (text === "unsupported") return 0.1;

  // Only genuine strings are parsed, so arrays/booleans keep the default.
  if (typeof value === "string" && /^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/.test(text)) {
    const numeric = Number(text);
    if (Number.isFinite(numeric)) return clamp(numeric);
  }
  return 0.5;
}
333
+
334
/**
 * Extracts a list of records from a parsed JSON value.
 *
 * @param {*} value - Parsed JSON document.
 * @returns {Array} The value itself when it is an array; otherwise the first
 *   array found under `records`, `repos`, `repositories`, `items` or `data`
 *   (checked in that order); otherwise the object wrapped in a one-element
 *   array. Non-object values yield an empty array.
 */
function firstArray(value) {
  if (Array.isArray(value)) return value;

  const isContainer = Boolean(value) && typeof value === "object";
  if (!isContainer) return [];

  const wrapperKey = ["records", "repos", "repositories", "items", "data"]
    .find((key) => Array.isArray(value[key]));
  return wrapperKey === undefined ? [value] : value[wrapperKey];
}
342
+
343
/**
 * Normalises one raw JSON record into the repository-record shape used by the
 * JSON extraction path.
 *
 * Improvements over a plain `String(a || b || c)` read of each field:
 * - only string/number field values are used, so an object-valued `owner`
 *   (for example `{ login: "octocat" }`) contributes its `login` instead of
 *   being stringified to "[object Object]";
 * - every URL field (`repo_url`, `github_url`, `url`, `html_url`) is tried in
 *   order and the first one `parseGithubRepoUrl` accepts wins, falling back
 *   to the first non-empty one when none parses;
 * - `null`/`undefined` topic entries are dropped rather than becoming the
 *   literal topics "null"/"undefined".
 *
 * @param {*} raw - One element of the parsed JSON document.
 * @returns {object | null} Normalised record (the original object is kept
 *   under `raw`), or `null` when `raw` is not a plain object or carries none
 *   of the recognised fields.
 */
function normalizeJsonRepoRecord(raw) {
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) return null;

  const scalarText = (value) => {
    if (!value) return "";
    const isScalar = typeof value === "string" || (typeof value === "number" && Number.isFinite(value));
    return isScalar ? String(value).trim() : "";
  };
  const firstText = (...values) => values.map(scalarText).find((item) => item !== "") || "";

  const urlCandidates = [raw.repo_url, raw.github_url, raw.url, raw.html_url]
    .map(scalarText)
    .filter(Boolean);
  let repoUrl = urlCandidates[0] || "";
  let parsedRepo = null;
  for (const candidate of urlCandidates) {
    const parsed = parseGithubRepoUrl(candidate);
    if (parsed) {
      repoUrl = candidate;
      parsedRepo = parsed;
      break;
    }
  }

  const ownerField = raw.owner && typeof raw.owner === "object" ? raw.owner.login : raw.owner;
  const owner = firstText(ownerField, parsedRepo?.owner);
  const repoName = firstText(raw.repo_name, raw.name, raw.repo, parsedRepo?.repo);
  const topics = Array.isArray(raw.topics)
    ? raw.topics.filter((topic) => topic != null).map((topic) => String(topic).trim()).filter(Boolean)
    : [];
  const snippet = firstText(raw.message_text_snippet, raw.snippet, raw.description, raw.summary);
  const sourceLink = firstText(raw.source_link, raw.source_url, raw.link);

  // Nothing recognisable in this record: let the caller skip it.
  if (!repoUrl && !owner && !repoName && !snippet && topics.length === 0 && !sourceLink) return null;

  return {
    repo_url: parsedRepo ? parsedRepo.github_url : repoUrl || null,
    owner,
    repo_name: repoName,
    message_text_snippet: snippet,
    source_link: sourceLink || null,
    topics,
    confidence_raw: raw.confidence ?? null,
    confidence: confidenceNumber(raw.confidence),
    // Without a parseable URL, owner + name are still enough to identify a repo.
    parsed_repo: parsedRepo || (owner && repoName ? {
      owner,
      repo: repoName,
      github_url: repoUrl || `https://github.com/${owner}/${repoName}`,
      key: `${owner.toLowerCase()}/${repoName.toLowerCase()}`,
    } : null),
    raw,
  };
}
371
+
249
372
  function confidenceTier(confidence) {
250
373
  if (confidence >= 0.8) return "high";
251
374
  if (confidence >= 0.5) return "medium";
@@ -292,10 +415,67 @@ function lineOf(text, needle) {
292
415
  return text.slice(0, idx).split(/\r?\n/).length;
293
416
  }
294
417
 
418
/**
 * Collapses whitespace and truncates text so the result never exceeds
 * `maxLength` characters.
 *
 * Truncated text ends in `...`, and the ellipsis is counted against the
 * limit. Previously `maxLength - 1` characters were kept before appending it,
 * so results could be up to two characters over the bound. When `maxLength`
 * is too small to fit any text plus the ellipsis, the text is cut hard.
 *
 * @param {*} value - Text to shorten; falsy values become an empty string.
 * @param {number} [maxLength=700] - Maximum length of the returned string.
 * @returns {string} Single-line text of at most `maxLength` characters.
 */
function boundedText(value, maxLength = 700) {
  const text = String(value || "").replace(/\s+/g, " ").trim();
  if (text.length <= maxLength) return text;

  const ellipsis = "...";
  const limit = Math.max(0, Math.floor(maxLength));
  if (limit <= ellipsis.length) return text.slice(0, limit);
  // Trim again so the ellipsis never follows a dangling space.
  return `${text.slice(0, limit - ellipsis.length).trim()}${ellipsis}`;
}
423
+
424
/**
 * Returns `value` unchanged when it is an array, otherwise a new empty array.
 *
 * @param {*} value - Any value.
 * @returns {Array} The same array instance, or `[]`.
 */
function asArray(value) {
  if (Array.isArray(value)) {
    return value;
  }
  return [];
}
427
+
428
/**
 * Produces a short excerpt of `text` around the first line mentioning
 * `repoUrl`.
 *
 * The excerpt covers up to two lines before and three lines after the
 * matching line, joined with spaces and passed through `boundedText` with its
 * default limit. When no line contains `repoUrl`, the whole text is passed to
 * `boundedText` with a limit of 500 instead.
 *
 * @param {*} text - Source text, split on LF / CRLF.
 * @param {string} repoUrl - Substring to look for in each line.
 * @returns {string} Whatever `boundedText` returns for the chosen excerpt.
 */
function localRepoSnippet(text, repoUrl) {
  const rows = String(text || "").split(/\r?\n/);

  let hit = -1;
  for (let i = 0; i < rows.length; i += 1) {
    if (rows[i].includes(repoUrl)) {
      hit = i;
      break;
    }
  }
  if (hit === -1) return boundedText(text, 500);

  const windowStart = hit < 2 ? 0 : hit - 2;
  const windowEnd = hit + 4 > rows.length ? rows.length : hit + 4;
  const excerpt = rows.slice(windowStart, windowEnd).join(" ");
  return boundedText(excerpt);
}
436
+
437
/**
 * Appends `value` to `array` unless an element with the same key is already
 * present. Falsy values are ignored.
 *
 * @param {Array} array - Array mutated in place.
 * @param {*} value - Candidate element.
 * @param {(item: *) => *} [keyFn] - Derives the comparison key (strict
 *   equality); defaults to the element itself.
 * @returns {void}
 */
function pushUnique(array, value, keyFn = (item) => item) {
  if (!value) return;
  const wanted = keyFn(value);
  const isNew = array.every((existing) => keyFn(existing) !== wanted);
  if (isNew) {
    array.push(value);
  }
}
442
+
443
/**
 * Lists the distinct source references of an entity as strings.
 *
 * @param {object | null | undefined} entity - Object with an optional
 *   `source_refs` array.
 * @returns {string[]} Truthy references, stringified and de-duplicated in
 *   first-seen order; empty when the entity or its list is missing.
 */
function sourceRefsForEntity(entity) {
  const rawRefs = asArray(entity && entity.source_refs);
  const distinct = new Set();
  for (const ref of rawRefs) {
    if (ref) distinct.add(String(ref));
  }
  return Array.from(distinct);
}
446
+
447
/**
 * Describes why an entity is a reasonable candidate for a concept, based only
 * on fields already present on the entity.
 *
 * @param {object} entity - Entity details; reads `source_refs`, `topics`,
 *   `category` and `summary`.
 * @param {string} conceptName - Concept name used in the fallback message.
 * @returns {string[]} One label per applicable signal (multiple source
 *   references, topics, a category other than "uncategorized", a summary), or
 *   a single generic statement when none applies.
 */
function decisionStrengthsForEntity(entity, conceptName) {
  const refCount = sourceRefsForEntity(entity).length;
  const topicList = asArray(entity.topics);
  const hasCategory = Boolean(entity.category) && entity.category !== "uncategorized";

  const signals = [
    [refCount > 1, "mentioned by multiple local source records"],
    [topicList.length > 0, `topic fit: ${topicList.slice(0, 3).join(", ")}`],
    [hasCategory, `category fit: ${entity.category}`],
    [Boolean(entity.summary), "has local snippet evidence"],
  ];
  const strengths = signals
    .filter(([applies]) => applies)
    .map(([, label]) => label);

  if (strengths.length === 0) {
    return [`local evidence connects this repository to ${conceptName}`];
  }
  return strengths;
}
455
+
456
/**
 * Lists review caveats for an entity.
 *
 * The first caveat is always present; the others are added when the entity
 * has no `github_url`, when its `confidence_tier` is "low" or "unsupported",
 * and when it has at most one distinct source reference.
 *
 * @param {object} entity - Entity details; reads `github_url`,
 *   `confidence_tier` and `source_refs`.
 * @returns {string[]} Caveat messages in a fixed order.
 */
function decisionCaveatsForEntity(entity) {
  const weakTiers = new Set(["low", "unsupported"]);
  const missingUrl = !entity.github_url;
  const weakConfidence = weakTiers.has(entity.confidence_tier);
  const singleSource = sourceRefsForEntity(entity).length <= 1;

  return [
    "local evidence only; verify externally before adoption",
    ...(missingUrl ? ["missing canonical repository URL"] : []),
    ...(weakConfidence ? ["low extraction confidence"] : []),
    ...(singleSource ? ["single local source reference"] : []),
  ];
}
463
+
464
/**
 * Counts records per confidence tier.
 *
 * The tier of a record is taken from `confidence_tier`, then from
 * `source_confidence` (the key used by comparison matrix rows), and only then
 * derived from a finite numeric `confidence` via `confidenceTier`. Records
 * with none of these — including `null` entries — and records whose tier is
 * not one of the known buckets are counted as `unknown`.
 *
 * @param {*} records - Expected to be an array of record objects; anything
 *   else is treated as an empty list.
 * @returns {{ high: number, medium: number, low: number, unsupported: number, unknown: number }}
 *   Number of records per tier.
 */
function confidenceSummary(records) {
  const counts = { high: 0, medium: 0, low: 0, unsupported: 0, unknown: 0 };
  for (const record of asArray(records)) {
    const entry = record && typeof record === "object" ? record : {};
    const explicitTier = entry.confidence_tier || entry.source_confidence;

    let tier = "unknown";
    if (explicitTier) {
      tier = String(explicitTier).toLowerCase();
    } else if (typeof entry.confidence === "number" && Number.isFinite(entry.confidence)) {
      // Only derive a tier from a real number; a missing value is "unknown".
      tier = String(confidenceTier(entry.confidence) || "unknown").toLowerCase();
    }

    if (Object.prototype.hasOwnProperty.call(counts, tier)) counts[tier] += 1;
    else counts.unknown += 1;
  }
  return counts;
}
473
+
295
474
  function buildExtraction({ projectPath, sourceRoot }) {
296
475
  const generatedAt = new Date().toISOString();
297
476
  const sourceRootRef = makeSourceRootRef(sourceRoot);
298
477
  const collected = collectMarkdownFiles(sourceRoot);
478
+ const collectedJson = collectJsonFiles(sourceRoot);
299
479
  const sources = [];
300
480
  const toolEntitiesById = new Map();
301
481
  const conceptsById = new Map();
@@ -367,7 +547,7 @@ function buildExtraction({ projectPath, sourceRoot }) {
367
547
  notes: qualityNotes,
368
548
  },
369
549
  provenance_refs: [],
370
- target_page_path: `.sdtk/wiki/personal-brain/sources/${sourceSlug}.md`,
550
+ target_page_path: `wiki/sources/${sourceSlug}.md`,
371
551
  };
372
552
 
373
553
  sources.push(sourceRecord);
@@ -435,12 +615,37 @@ function buildExtraction({ projectPath, sourceRoot }) {
435
615
  confidence_tier: "high",
436
616
  source_refs: [],
437
617
  provenance_refs: [],
438
- target_page_path: `.sdtk/wiki/personal-brain/entities/tools/${safeSlug(repo.repo, "tool")}--${entityId}.md`,
618
+ topics: concepts.map((concept) => concept.name),
619
+ source_links: [],
620
+ evidence_snippets: [],
621
+ discovery_sources: [],
622
+ evidence_records: [],
623
+ target_page_path: `wiki/entities/tools/${safeSlug(repo.repo, "tool")}--${entityId}.md`,
439
624
  });
440
625
  }
441
626
  const entity = toolEntitiesById.get(entityId);
442
627
  if (!entity.source_refs.includes(sourceId)) entity.source_refs.push(sourceId);
443
628
  if (!entity.provenance_refs.includes(prov.provenance_id)) entity.provenance_refs.push(prov.provenance_id);
629
+ for (const concept of concepts) {
630
+ if (!entity.topics) entity.topics = [];
631
+ pushUnique(entity.topics, concept.name);
632
+ }
633
+ const snippet = localRepoSnippet(text, repo.github_url);
634
+ if (!entity.evidence_snippets) entity.evidence_snippets = [];
635
+ pushUnique(entity.evidence_snippets, snippet);
636
+ if (!entity.discovery_sources) entity.discovery_sources = [];
637
+ pushUnique(entity.discovery_sources, sourceLogicalPath);
638
+ if (!entity.evidence_records) entity.evidence_records = [];
639
+ pushUnique(entity.evidence_records, {
640
+ source_id: sourceId,
641
+ source_logical_path: sourceLogicalPath,
642
+ source_link: sourceUrl || null,
643
+ snippet,
644
+ topics: concepts.map((concept) => concept.name),
645
+ provenance_refs: [prov.provenance_id],
646
+ confidence: 0.9,
647
+ confidence_tier: "high",
648
+ }, (record) => `${record.source_id}:${record.source_link || ""}:${record.snippet}`);
444
649
 
445
650
  claims.push({
446
651
  claim_id: `claim_${sourceId}_${String(claims.length + 1).padStart(3, "0")}`,
@@ -478,7 +683,7 @@ function buildExtraction({ projectPath, sourceRoot }) {
478
683
  provenance_refs: [],
479
684
  confidence: 0.65,
480
685
  confidence_tier: "medium",
481
- target_page_path: `.sdtk/wiki/personal-brain/concepts/${safeSlug(conceptRule.name, "concept")}.md`,
686
+ target_page_path: `wiki/concepts/${safeSlug(conceptRule.name, "concept")}.md`,
482
687
  });
483
688
  }
484
689
  const concept = conceptsById.get(conceptRule.concept_id);
@@ -502,6 +707,374 @@ function buildExtraction({ projectPath, sourceRoot }) {
502
707
  }
503
708
  }
504
709
 
710
+ for (const filePath of collectedJson.files) {
711
+ const sourceRelativePath = toPosix(path.relative(sourceRoot, filePath));
712
+ const sourceDisplayPath = toPosix(filePath);
713
+ const sourceLogicalFilePath = `${sourceRootRef.source_root_label}/${sourceRelativePath}`;
714
+ const fileBytes = fs.readFileSync(filePath);
715
+ const fileHash = sha256(fileBytes);
716
+ const stats = fs.statSync(filePath);
717
+ let parsed;
718
+
719
+ try {
720
+ parsed = JSON.parse(fileBytes.toString("utf-8"));
721
+ } catch (error) {
722
+ const sourceId = `src_${sha256(`local-json-file:v1:${sourceRootRef.source_root_id}:${sourceRelativePath.toLowerCase()}`).slice(0, 16)}`;
723
+ const sourceRecord = {
724
+ source_id: sourceId,
725
+ source_root_id: sourceRootRef.source_root_id,
726
+ source_relative_path: sourceRelativePath,
727
+ source_logical_path: sourceLogicalFilePath,
728
+ source_display_path: sourceDisplayPath,
729
+ source_type: "json",
730
+ title: path.basename(filePath),
731
+ source_url: null,
732
+ source_hash: fileHash,
733
+ size_bytes: stats.size,
734
+ modified_time: stats.mtime.toISOString(),
735
+ encoding_quality: "unknown",
736
+ source_quality: {
737
+ has_mojibake: false,
738
+ mojibake_score: 0,
739
+ has_source_url: false,
740
+ weak_title: false,
741
+ duplicate_candidate: false,
742
+ duplicate_group_id: null,
743
+ low_confidence_extraction: true,
744
+ quality_flags: ["invalid_json"],
745
+ notes: [`Invalid JSON could not be parsed: ${error.message}`],
746
+ },
747
+ provenance_refs: [],
748
+ target_page_path: `wiki/sources/${safeSlug(path.basename(filePath), "json-source")}--${sourceId.slice(0, 8)}.md`,
749
+ };
750
+ sources.push(sourceRecord);
751
+ sourceQualityFindings.push({
752
+ finding_id: `sq_${sourceId}`,
753
+ source_id: sourceId,
754
+ source_relative_path: sourceRelativePath,
755
+ source_logical_path: sourceLogicalFilePath,
756
+ quality_flags: ["invalid_json"],
757
+ confidence: 0.1,
758
+ confidence_tier: "unsupported",
759
+ notes: sourceRecord.source_quality.notes,
760
+ });
761
+ unsupportedItems.push({
762
+ record_type: "unsupported_item",
763
+ item_id: `unsupported_${sourceId}_001`,
764
+ source_id: sourceId,
765
+ reason: "invalid_json",
766
+ raw_observation_summary: `Invalid JSON file: ${sourceRelativePath}`,
767
+ confidence: 0.1,
768
+ confidence_tier: "unsupported",
769
+ provenance_refs: [],
770
+ });
771
+ continue;
772
+ }
773
+
774
+ const rawRecords = firstArray(parsed);
775
+ const normalizedRecords = rawRecords.map(normalizeJsonRepoRecord).filter(Boolean);
776
+ if (normalizedRecords.length === 0) {
777
+ const sourceId = `src_${sha256(`local-json-empty:v1:${sourceRootRef.source_root_id}:${sourceRelativePath.toLowerCase()}`).slice(0, 16)}`;
778
+ const sourceRecord = {
779
+ source_id: sourceId,
780
+ source_root_id: sourceRootRef.source_root_id,
781
+ source_relative_path: sourceRelativePath,
782
+ source_logical_path: sourceLogicalFilePath,
783
+ source_display_path: sourceDisplayPath,
784
+ source_type: "json",
785
+ title: path.basename(filePath),
786
+ source_url: null,
787
+ source_hash: fileHash,
788
+ size_bytes: stats.size,
789
+ modified_time: stats.mtime.toISOString(),
790
+ encoding_quality: "clean",
791
+ source_quality: {
792
+ has_mojibake: false,
793
+ mojibake_score: 0,
794
+ has_source_url: false,
795
+ weak_title: false,
796
+ duplicate_candidate: false,
797
+ duplicate_group_id: null,
798
+ low_confidence_extraction: true,
799
+ quality_flags: ["empty_json_records"],
800
+ notes: ["JSON parsed successfully but contained no supported repository records."],
801
+ },
802
+ provenance_refs: [],
803
+ target_page_path: `wiki/sources/${safeSlug(path.basename(filePath), "json-source")}--${sourceId.slice(0, 8)}.md`,
804
+ };
805
+ sources.push(sourceRecord);
806
+ sourceQualityFindings.push({
807
+ finding_id: `sq_${sourceId}`,
808
+ source_id: sourceId,
809
+ source_relative_path: sourceRelativePath,
810
+ source_logical_path: sourceLogicalFilePath,
811
+ quality_flags: ["empty_json_records"],
812
+ confidence: 0.2,
813
+ confidence_tier: "unsupported",
814
+ notes: sourceRecord.source_quality.notes,
815
+ });
816
+ continue;
817
+ }
818
+
819
+ normalizedRecords.forEach((record, index) => {
820
+ const recordRef = `record-${String(index + 1).padStart(3, "0")}`;
821
+ const repoKey = record.repo_url || `${record.owner}/${record.repo_name}` || sha256(JSON.stringify(record.raw));
822
+ const sourceId = `src_${sha256(`local-json:v1:${sourceRootRef.source_root_id}:${sourceRelativePath.toLowerCase()}:${repoKey.toLowerCase()}`).slice(0, 16)}`;
823
+ const sourceHash = sha256(`${fileHash}:${sha256(JSON.stringify(record.raw))}`);
824
+ const title = record.repo_name || record.owner || `${path.basename(filePath)} ${recordRef}`;
825
+ const sourceLogicalPath = `${sourceLogicalFilePath}#${recordRef}`;
826
+ const sourceDisplayRecordPath = `${sourceDisplayPath}#${recordRef}`;
827
+ const mojibake = detectMojibake(record.message_text_snippet);
828
+ const weakTitle = title.length < 3;
829
+ const qualityFlags = [];
830
+ const qualityNotes = [];
831
+ const sourceUrl = record.repo_url || record.source_link || null;
832
+ const confidence = record.confidence;
833
+ const confidenceBand = confidenceTier(confidence);
834
+
835
+ if (mojibake.hasMojibake) {
836
+ qualityFlags.push("mojibake_detected");
837
+ qualityNotes.push("Potential mojibake or replacement characters detected in JSON snippet.");
838
+ }
839
+ if (!record.parsed_repo) {
840
+ qualityFlags.push("missing_repo_url");
841
+ qualityNotes.push("JSON record does not include a supported GitHub repository URL.");
842
+ }
843
+ if (!record.source_link) {
844
+ qualityFlags.push("missing_source_link");
845
+ qualityNotes.push("JSON record does not include a source_link.");
846
+ }
847
+ if (weakTitle) {
848
+ qualityFlags.push("weak_title");
849
+ qualityNotes.push("Repository title is missing or very short.");
850
+ }
851
+ if (confidence < 0.5) {
852
+ qualityFlags.push("low_confidence_extraction");
853
+ }
854
+
855
+ const sourceRecord = {
856
+ source_id: sourceId,
857
+ source_root_id: sourceRootRef.source_root_id,
858
+ source_relative_path: sourceRelativePath,
859
+ source_logical_path: sourceLogicalPath,
860
+ source_display_path: sourceDisplayRecordPath,
861
+ source_type: "json_record",
862
+ title,
863
+ source_url: sourceUrl,
864
+ source_hash: sourceHash,
865
+ size_bytes: stats.size,
866
+ modified_time: stats.mtime.toISOString(),
867
+ encoding_quality: mojibake.hasMojibake ? "suspect" : "clean",
868
+ source_record_locator: {
869
+ type: "json_record",
870
+ record_index: index,
871
+ record_ref: recordRef,
872
+ record_pointer: `/${index}`,
873
+ },
874
+ structured_fields: {
875
+ repo_url: record.repo_url,
876
+ owner: record.owner,
877
+ repo_name: record.repo_name,
878
+ message_text_snippet: record.message_text_snippet,
879
+ source_link: record.source_link,
880
+ topics: record.topics,
881
+ confidence: record.confidence_raw,
882
+ },
883
+ source_quality: {
884
+ has_mojibake: mojibake.hasMojibake,
885
+ mojibake_score: Number(mojibake.score.toFixed(3)),
886
+ has_source_url: Boolean(sourceUrl),
887
+ weak_title: weakTitle,
888
+ duplicate_candidate: false,
889
+ duplicate_group_id: null,
890
+ low_confidence_extraction: confidence < 0.5,
891
+ quality_flags: qualityFlags,
892
+ notes: qualityNotes,
893
+ },
894
+ provenance_refs: [],
895
+ target_page_path: `wiki/sources/${safeSlug(title || sourceRelativePath, "source")}--${sourceId.slice(0, 8)}.md`,
896
+ };
897
+
898
+ sources.push(sourceRecord);
899
+ if (record.repo_url) {
900
+ const existing = sourceUrlUsage.get(record.repo_url) || [];
901
+ existing.push(sourceRecord);
902
+ sourceUrlUsage.set(record.repo_url, existing);
903
+ }
904
+
905
+ if (qualityFlags.length > 0) {
906
+ sourceQualityFindings.push({
907
+ finding_id: `sq_${sourceId}`,
908
+ source_id: sourceId,
909
+ source_relative_path: sourceRelativePath,
910
+ source_logical_path: sourceLogicalPath,
911
+ quality_flags: [...qualityFlags],
912
+ confidence,
913
+ confidence_tier: confidenceBand,
914
+ notes: qualityNotes,
915
+ });
916
+ }
917
+
918
+ const concepts = [
919
+ ...inferConcepts(`${title}\n${record.message_text_snippet}\n${record.topics.join("\n")}`),
920
+ ...inferConceptsFromTopics(record.topics),
921
+ ];
922
+ const category = record.topics[0] ? normalizeTopic(record.topics[0]) : categoryForSource(record.message_text_snippet, concepts);
923
+
924
+ if (!record.parsed_repo) {
925
+ unsupportedItems.push({
926
+ record_type: "unsupported_item",
927
+ item_id: `unsupported_${sourceId}_001`,
928
+ source_id: sourceId,
929
+ reason: "missing_repo_url",
930
+ raw_observation_summary: `${sourceRelativePath}#${recordRef}`,
931
+ confidence: 0.2,
932
+ confidence_tier: "unsupported",
933
+ provenance_refs: [],
934
+ });
935
+ }
936
+
937
+ if (record.parsed_repo) {
938
+ const repo = record.parsed_repo;
939
+ const entityId = `tool_github_${safeSlug(repo.owner, "owner")}_${safeSlug(repo.repo, "repo")}`;
940
+ const prov = {
941
+ provenance_id: `prov_${sourceId}_${recordRef}`,
942
+ source_id: sourceId,
943
+ source_hash: sourceHash,
944
+ source_relative_path: sourceRelativePath,
945
+ source_logical_path: sourceLogicalPath,
946
+ locator: {
947
+ type: "json_record",
948
+ record_index: index,
949
+ record_ref: recordRef,
950
+ record_pointer: `/${index}`,
951
+ field: "repo_url",
952
+ },
953
+ evidence_quote_hash: sha256(`${sourceId}:${recordRef}:${repo.github_url}`),
954
+ extractor: "sdtk-wiki.semantic-extract",
955
+ extractor_version: "bk140-json-records",
956
+ generated_at: generatedAt,
957
+ confidence,
958
+ };
959
+ provenance.push(prov);
960
+ sourceRecord.provenance_refs.push(prov.provenance_id);
961
+
962
+ if (!toolEntitiesById.has(entityId)) {
963
+ toolEntitiesById.set(entityId, {
964
+ entity_id: entityId,
965
+ entity_type: "tool_entity",
966
+ name: repo.repo,
967
+ repo_owner: repo.owner,
968
+ repo_name: repo.repo,
969
+ github_url: repo.github_url,
970
+ category,
971
+ summary: record.message_text_snippet || `${repo.repo} is a locally sourced GitHub tool candidate in category ${category}.`,
972
+ confidence,
973
+ confidence_tier: confidenceBand,
974
+ source_refs: [],
975
+ provenance_refs: [],
976
+ topics: [...record.topics],
977
+ source_links: record.source_link ? [record.source_link] : [],
978
+ evidence_snippets: record.message_text_snippet ? [boundedText(record.message_text_snippet)] : [],
979
+ discovery_sources: [record.source_link, sourceLogicalPath].filter(Boolean),
980
+ evidence_records: [{
981
+ source_id: sourceId,
982
+ source_logical_path: sourceLogicalPath,
983
+ source_link: record.source_link || null,
984
+ snippet: boundedText(record.message_text_snippet),
985
+ topics: [...record.topics],
986
+ provenance_refs: [prov.provenance_id],
987
+ confidence,
988
+ confidence_tier: confidenceBand,
989
+ }],
990
+ target_page_path: `wiki/entities/tools/${safeSlug(repo.repo, "tool")}--${entityId}.md`,
991
+ });
992
+ }
993
+ const entity = toolEntitiesById.get(entityId);
994
+ if (!entity.source_refs.includes(sourceId)) entity.source_refs.push(sourceId);
995
+ if (!entity.provenance_refs.includes(prov.provenance_id)) entity.provenance_refs.push(prov.provenance_id);
996
+ for (const topic of record.topics) {
997
+ if (!entity.topics) entity.topics = [];
998
+ if (!entity.topics.includes(topic)) entity.topics.push(topic);
999
+ }
1000
+ if (record.source_link) {
1001
+ if (!entity.source_links) entity.source_links = [];
1002
+ if (!entity.source_links.includes(record.source_link)) entity.source_links.push(record.source_link);
1003
+ }
1004
+ if (record.message_text_snippet) {
1005
+ if (!entity.evidence_snippets) entity.evidence_snippets = [];
1006
+ pushUnique(entity.evidence_snippets, boundedText(record.message_text_snippet));
1007
+ }
1008
+ if (!entity.discovery_sources) entity.discovery_sources = [];
1009
+ pushUnique(entity.discovery_sources, record.source_link);
1010
+ pushUnique(entity.discovery_sources, sourceLogicalPath);
1011
+ if (!entity.evidence_records) entity.evidence_records = [];
1012
+ pushUnique(entity.evidence_records, {
1013
+ source_id: sourceId,
1014
+ source_logical_path: sourceLogicalPath,
1015
+ source_link: record.source_link || null,
1016
+ snippet: boundedText(record.message_text_snippet),
1017
+ topics: [...record.topics],
1018
+ provenance_refs: [prov.provenance_id],
1019
+ confidence,
1020
+ confidence_tier: confidenceBand,
1021
+ }, (item) => `${item.source_id}:${item.source_link || ""}:${item.snippet || ""}`);
1022
+
1023
+ claims.push({
1024
+ claim_id: `claim_${sourceId}_${String(claims.length + 1).padStart(3, "0")}`,
1025
+ text: `The local JSON record presents ${repo.repo} as a ${category} repository candidate.`,
1026
+ subject_entity_id: entityId,
1027
+ source_refs: [sourceId],
1028
+ provenance_refs: [prov.provenance_id],
1029
+ confidence,
1030
+ confidence_tier: confidenceBand,
1031
+ contested: false,
1032
+ });
1033
+
1034
+ relations.push({
1035
+ relation_id: `rel_${sourceId}_${String(relations.length + 1).padStart(3, "0")}`,
1036
+ source_id: sourceId,
1037
+ target_id: entityId,
1038
+ relation_type: "source_mentions_entity",
1039
+ evidence: "The local JSON record includes a GitHub repository URL.",
1040
+ source_refs: [sourceId],
1041
+ provenance_refs: [prov.provenance_id],
1042
+ confidence,
1043
+ confidence_tier: confidenceBand,
1044
+ });
1045
+
1046
+ for (const conceptRule of concepts) {
1047
+ if (!conceptsById.has(conceptRule.concept_id)) {
1048
+ conceptsById.set(conceptRule.concept_id, {
1049
+ ...conceptRule,
1050
+ related_entities: conceptRule.related_entities || [],
1051
+ source_refs: conceptRule.source_refs || [],
1052
+ provenance_refs: conceptRule.provenance_refs || [],
1053
+ confidence: conceptRule.confidence || 0.6,
1054
+ confidence_tier: conceptRule.confidence_tier || "medium",
1055
+ target_page_path: conceptRule.target_page_path || `wiki/concepts/${safeSlug(conceptRule.name, "concept")}.md`,
1056
+ });
1057
+ }
1058
+ const concept = conceptsById.get(conceptRule.concept_id);
1059
+ if (!concept.source_refs.includes(sourceId)) concept.source_refs.push(sourceId);
1060
+ if (!concept.provenance_refs.includes(prov.provenance_id)) concept.provenance_refs.push(prov.provenance_id);
1061
+ if (!concept.related_entities.includes(entityId)) concept.related_entities.push(entityId);
1062
+ relations.push({
1063
+ relation_id: `rel_${sourceId}_${String(relations.length + 1).padStart(3, "0")}`,
1064
+ source_id: entityId,
1065
+ target_id: conceptRule.concept_id,
1066
+ relation_type: "entity_implements_concept",
1067
+ evidence: "The local JSON record includes matching topics or semantic keywords.",
1068
+ source_refs: [sourceId],
1069
+ provenance_refs: [prov.provenance_id],
1070
+ confidence: Math.min(0.8, confidence + 0.05),
1071
+ confidence_tier: confidenceTier(Math.min(0.8, confidence + 0.05)),
1072
+ });
1073
+ }
1074
+ }
1075
+ });
1076
+ }
1077
+
505
1078
  for (const [sourceUrl, sourceRecords] of sourceUrlUsage.entries()) {
506
1079
  if (sourceRecords.length < 2) continue;
507
1080
  const duplicateGroupId = `dup_${sha256(sourceUrl).slice(0, 12)}`;
@@ -525,6 +1098,67 @@ function buildExtraction({ projectPath, sourceRoot }) {
525
1098
  }
526
1099
  }
527
1100
 
1101
+ const toolEntityValues = [...toolEntitiesById.values()];
1102
+ for (const entity of toolEntityValues) {
1103
+ const entityTopics = new Set(asArray(entity.topics).map((topic) => String(topic).toLowerCase()));
1104
+ const related = [];
1105
+ for (const candidate of toolEntityValues) {
1106
+ if (candidate.entity_id === entity.entity_id) continue;
1107
+ const sharedTopics = asArray(candidate.topics).filter((topic) => entityTopics.has(String(topic).toLowerCase()));
1108
+ const sameCategory = entity.category && candidate.category && entity.category === candidate.category;
1109
+ if (sharedTopics.length === 0 && !sameCategory) continue;
1110
+ related.push({
1111
+ entity_id: candidate.entity_id,
1112
+ name: candidate.name || candidate.repo_name || candidate.entity_id,
1113
+ github_url: candidate.github_url || null,
1114
+ shared_topics: sharedTopics,
1115
+ relation_hint: sharedTopics.length > 0 ? "shared_topic" : "same_category",
1116
+ });
1117
+ }
1118
+ entity.related_repos = related
1119
+ .sort((a, b) => b.shared_topics.length - a.shared_topics.length || a.entity_id.localeCompare(b.entity_id))
1120
+ .slice(0, 8);
1121
+ }
1122
+
1123
+ for (const concept of conceptsById.values()) {
1124
+ const relatedDetails = [];
1125
+ const axisCounts = new Map();
1126
+ for (const entityId of asArray(concept.related_entities)) {
1127
+ const entity = toolEntitiesById.get(entityId);
1128
+ if (!entity) continue;
1129
+ for (const topic of asArray(entity.topics)) {
1130
+ const key = String(topic || "").trim();
1131
+ if (key) axisCounts.set(key, (axisCounts.get(key) || 0) + 1);
1132
+ }
1133
+ if (entity.category) axisCounts.set(entity.category, (axisCounts.get(entity.category) || 0) + 1);
1134
+ relatedDetails.push({
1135
+ entity_id: entity.entity_id,
1136
+ name: entity.name || entity.repo_name || entity.entity_id,
1137
+ repo_owner: entity.repo_owner || null,
1138
+ repo_name: entity.repo_name || null,
1139
+ github_url: entity.github_url || null,
1140
+ category: entity.category || "uncategorized",
1141
+ topics: asArray(entity.topics).slice(0, 8),
1142
+ summary: entity.summary || "",
1143
+ target_page_path: entity.target_page_path || null,
1144
+ source_refs: sourceRefsForEntity(entity),
1145
+ confidence: entity.confidence,
1146
+ confidence_tier: entity.confidence_tier || confidenceTier(entity.confidence),
1147
+ });
1148
+ }
1149
+ concept.related_entity_details = relatedDetails
1150
+ .sort((a, b) => (b.source_refs.length - a.source_refs.length) || a.name.localeCompare(b.name))
1151
+ .slice(0, 12);
1152
+ concept.key_axes = [...axisCounts.entries()]
1153
+ .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
1154
+ .slice(0, 8)
1155
+ .map(([name, count]) => ({ name, evidence_count: count }));
1156
+ concept.patterns = concept.key_axes.slice(0, 5).map((axis) => ({
1157
+ pattern: axis.name,
1158
+ evidence: `${axis.evidence_count} locally extracted tool candidate(s) are associated with this axis.`,
1159
+ }));
1160
+ }
1161
+
528
1162
  const toolEntities = [...toolEntitiesById.values()].sort((a, b) => a.entity_id.localeCompare(b.entity_id));
529
1163
  const concepts = [...conceptsById.values()].sort((a, b) => a.concept_id.localeCompare(b.concept_id));
530
1164
  const comparisons = [];
@@ -533,27 +1167,69 @@ function buildExtraction({ projectPath, sourceRoot }) {
533
1167
  for (const concept of concepts) {
534
1168
  if (concept.related_entities.length < 2) continue;
535
1169
  const topicSlug = safeSlug(concept.name, "topic");
1170
+ const comparedDetails = asArray(concept.related_entity_details).slice(0, 8);
1171
+ const decisionAxes = asArray(concept.key_axes).slice(0, 6);
1172
+ const matrixRows = comparedDetails.map((entity) => ({
1173
+ entity_id: entity.entity_id,
1174
+ name: entity.repo_owner && entity.repo_name ? `${entity.repo_owner}/${entity.repo_name}` : entity.name,
1175
+ github_url: entity.github_url || null,
1176
+ category: entity.category || "uncategorized",
1177
+ topics: asArray(entity.topics).slice(0, 6),
1178
+ strengths: decisionStrengthsForEntity(entity, concept.name),
1179
+ caveats: decisionCaveatsForEntity(entity),
1180
+ source_ref_count: sourceRefsForEntity(entity).length,
1181
+ source_confidence: entity.confidence_tier || confidenceTier(entity.confidence),
1182
+ local_recommendation: sourceRefsForEntity(entity).length > 1
1183
+ ? "shortlist for human review"
1184
+ : "keep as a candidate until more local evidence is available",
1185
+ }));
1186
+ const summary = `Local sources mention ${concept.related_entities.length} tool candidate(s) related to ${concept.name}. This page compares candidates for review, not verified ranking.`;
1187
+ const recommendations = matrixRows.slice(0, 3).map((row) => `${row.name}: ${row.local_recommendation}; strengths: ${row.strengths.slice(0, 2).join("; ")}.`);
536
1188
  comparisons.push({
537
1189
  comparison_id: `comparison_${topicSlug}_${sha256(concept.related_entities.join("|")).slice(0, 8)}`,
538
1190
  topic: concept.name,
539
1191
  compared_entities: concept.related_entities.slice(0, 8),
540
- criteria: ["local evidence", "category fit", "source confidence"],
1192
+ compared_entity_details: comparedDetails,
1193
+ decision_axes: decisionAxes,
1194
+ matrix_rows: matrixRows,
1195
+ criteria: ["local evidence", "category/topic fit", "source confidence", "review caveats"],
1196
+ summary,
1197
+ recommendations,
1198
+ caveats: [
1199
+ "The comparison uses local source evidence only.",
1200
+ "Do not treat ordering as an external quality ranking.",
1201
+ "Verify license, maintenance, security, and ecosystem fit before adoption.",
1202
+ ],
1203
+ source_confidence_summary: confidenceSummary(matrixRows),
541
1204
  source_refs: concept.source_refs,
542
1205
  provenance_refs: concept.provenance_refs,
543
1206
  confidence: 0.55,
544
1207
  confidence_tier: "medium",
545
- target_page_path: `.sdtk/wiki/personal-brain/comparisons/${topicSlug}.md`,
1208
+ target_page_path: `wiki/comparisons/${topicSlug}.md`,
546
1209
  });
547
1210
  syntheses.push({
548
1211
  synthesis_id: `synthesis_${topicSlug}_${sha256(concept.source_refs.join("|")).slice(0, 8)}`,
549
1212
  topic: concept.name,
550
- summary: `Local sources mention ${concept.related_entities.length} tool candidate(s) related to ${concept.name}.`,
551
- recommendations: ["Review extracted entities and source quality findings before compile/apply work."],
1213
+ summary,
1214
+ landscape_axes: decisionAxes,
1215
+ candidate_tools: matrixRows,
1216
+ patterns: asArray(concept.patterns),
1217
+ related_comparison_path: `wiki/comparisons/${topicSlug}.md`,
1218
+ source_confidence_summary: confidenceSummary(matrixRows),
1219
+ recommendations: [
1220
+ ...recommendations,
1221
+ "Use this synthesis to select review candidates; defer adoption until external verification is complete.",
1222
+ ],
1223
+ caveats: [
1224
+ "Local extraction can include stale or incomplete source snippets.",
1225
+ "No web verification, GitHub API data, stars, licenses, or release cadence are claimed here.",
1226
+ "Human review should resolve topic fit and source-quality warnings before product decisions.",
1227
+ ],
552
1228
  source_refs: concept.source_refs,
553
1229
  provenance_refs: concept.provenance_refs,
554
1230
  confidence: 0.55,
555
1231
  confidence_tier: "medium",
556
- target_page_path: `.sdtk/wiki/personal-brain/syntheses/${topicSlug}.md`,
1232
+ target_page_path: `wiki/syntheses/${topicSlug}.md`,
557
1233
  });
558
1234
  }
559
1235
 
@@ -566,7 +1242,7 @@ function buildExtraction({ projectPath, sourceRoot }) {
566
1242
  project_path: projectPath,
567
1243
  source_root_refs: [sourceRootRef],
568
1244
  source_counts: {
569
- scanned: collected.scanned,
1245
+ scanned: collected.scanned + collectedJson.scanned,
570
1246
  indexed: sources.length,
571
1247
  extracted: toolEntities.length,
572
1248
  skipped: collected.skipped.length,