sneakoscope 2.0.12 → 2.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/crates/sks-core/Cargo.lock +1 -1
- package/crates/sks-core/Cargo.toml +1 -1
- package/crates/sks-core/src/main.rs +1 -1
- package/dist/.sks-build-stamp.json +4 -4
- package/dist/bin/sks.js +1 -1
- package/dist/build-manifest.json +24 -8
- package/dist/core/codex-control/codex-sdk-adapter.js +10 -0
- package/dist/core/codex-control/codex-task-runner.js +4 -2
- package/dist/core/commands/research-command.js +43 -4
- package/dist/core/fsx.js +1 -1
- package/dist/core/research/claim-evidence-matrix.js +160 -0
- package/dist/core/research/experiment-plan.js +53 -0
- package/dist/core/research/falsification.js +18 -0
- package/dist/core/research/implementation-blueprint-markdown.js +31 -0
- package/dist/core/research/implementation-blueprint.js +66 -0
- package/dist/core/research/replication-pack.js +50 -0
- package/dist/core/research/research-cycle-runner.js +25 -0
- package/dist/core/research/research-final-reviewer.js +58 -0
- package/dist/core/research/research-handoff.js +51 -0
- package/dist/core/research/research-prompt-contract.js +24 -0
- package/dist/core/research/research-quality-contract.js +61 -0
- package/dist/core/research/research-report-quality.js +67 -0
- package/dist/core/research/research-stage-runner.js +16 -0
- package/dist/core/research/research-work-graph.js +75 -0
- package/dist/core/research/source-quality-report.js +94 -0
- package/dist/core/research.js +344 -44
- package/dist/core/version.js +1 -1
- package/dist/core/zellij/zellij-slot-column-anchor.js +5 -3
- package/dist/core/zellij/zellij-slot-pane-renderer.js +259 -16
- package/dist/scripts/codex-sdk-research-pipeline-check.js +7 -0
- package/dist/scripts/packlist-performance-check.js +1 -1
- package/dist/scripts/research-quality-gate-check.js +86 -0
- package/dist/scripts/zellij-slot-column-anchor-check.js +26 -5
- package/dist/scripts/zellij-slot-pane-renderer-check.js +73 -5
- package/package.json +13 -1
- package/schemas/research/claim-evidence-matrix.schema.json +37 -0
- package/schemas/research/experiment-plan.schema.json +17 -0
- package/schemas/research/implementation-blueprint.schema.json +30 -0
- package/schemas/research/replication-pack.schema.json +17 -0
- package/schemas/research/research-final-review.schema.json +16 -0
- package/schemas/research/research-quality-contract.schema.json +37 -0
- package/schemas/research/source-quality-report.schema.json +18 -0
package/dist/core/research.js
CHANGED
|
@@ -3,6 +3,19 @@ import { appendJsonlBounded, nowIso, readJson, readText, writeJsonAtomic, writeT
|
|
|
3
3
|
import { OUTCOME_RUBRIC } from './proof-field.js';
|
|
4
4
|
import { RESEARCH_AGENT_PERSONA_CONTRACT, validateResearchAgentPersonas } from './recallpulse.js';
|
|
5
5
|
import { appendAgentLedgerEvent, initializeAgentCentralLedger } from './agents/agent-central-ledger.js';
|
|
6
|
+
import { CLAIM_EVIDENCE_MATRIX_ARTIFACT, buildClaimEvidenceMatrixFromLedgers, defaultClaimEvidenceMatrix, readClaimEvidenceMatrix, validateClaimEvidenceMatrix, writeClaimEvidenceMatrix } from './research/claim-evidence-matrix.js';
|
|
7
|
+
import { EXPERIMENT_PLAN_JSON_ARTIFACT, EXPERIMENT_PLAN_MARKDOWN_ARTIFACT, defaultExperimentPlan, readExperimentPlan, validateExperimentPlan, writeExperimentPlan } from './research/experiment-plan.js';
|
|
8
|
+
import { IMPLEMENTATION_BLUEPRINT_ARTIFACT, defaultImplementationBlueprint, readImplementationBlueprint, validateImplementationBlueprint, writeImplementationBlueprint } from './research/implementation-blueprint.js';
|
|
9
|
+
import { IMPLEMENTATION_BLUEPRINT_MARKDOWN_ARTIFACT, renderImplementationBlueprintMarkdown } from './research/implementation-blueprint-markdown.js';
|
|
10
|
+
import { REPLICATION_PACK_ARTIFACT, defaultReplicationPack, readReplicationPack, validateReplicationPack, writeReplicationPack } from './research/replication-pack.js';
|
|
11
|
+
import { RESEARCH_QUALITY_CONTRACT_ARTIFACT, DEFAULT_RESEARCH_QUALITY_CONTRACT, readResearchQualityContract, writeResearchQualityContract } from './research/research-quality-contract.js';
|
|
12
|
+
import { RESEARCH_FINAL_REVIEW_ARTIFACT, readResearchFinalReview, runResearchFinalReviewer } from './research/research-final-reviewer.js';
|
|
13
|
+
import { SOURCE_QUALITY_REPORT_ARTIFACT, readSourceQualityReport, writeSourceQualityReport } from './research/source-quality-report.js';
|
|
14
|
+
import { analyzeResearchReportQuality, countWords } from './research/research-report-quality.js';
|
|
15
|
+
import { validateFalsificationCoverage } from './research/falsification.js';
|
|
16
|
+
import { writeResearchHandoffArtifacts } from './research/research-handoff.js';
|
|
17
|
+
import { RESEARCH_WORK_GRAPH_ARTIFACT, writeResearchWorkGraph } from './research/research-work-graph.js';
|
|
18
|
+
import { researchPromptContractText, validateResearchPromptContract } from './research/research-prompt-contract.js';
|
|
6
19
|
export const RESEARCH_PAPER_ARTIFACT = 'research-paper.md';
|
|
7
20
|
export const RESEARCH_SOURCE_SKILL_ARTIFACT = 'research-source-skill.md';
|
|
8
21
|
export const RESEARCH_GENIUS_SUMMARY_ARTIFACT = 'genius-opinion-summary.md';
|
|
@@ -246,6 +259,7 @@ export function createResearchPlan(prompt, opts = {}) {
|
|
|
246
259
|
created_at: createdAt,
|
|
247
260
|
methodology: opts.autoresearch ? 'native-agent-autoresearch-batch-frontier-loop' : 'native-agent-research-council-frontier-discovery-loop',
|
|
248
261
|
paper_artifact: paperArtifact,
|
|
262
|
+
quality_contract: DEFAULT_RESEARCH_QUALITY_CONTRACT,
|
|
249
263
|
native_agent_plan: nativeAgentPlan,
|
|
250
264
|
agent_sessions: nativeAgentPlan.personas,
|
|
251
265
|
agent_batches: nativeAgentPlan.batches,
|
|
@@ -254,7 +268,17 @@ export function createResearchPlan(prompt, opts = {}) {
|
|
|
254
268
|
research_paper: paperArtifact,
|
|
255
269
|
legacy_research_paper: RESEARCH_PAPER_ARTIFACT,
|
|
256
270
|
genius_opinion_summary: RESEARCH_GENIUS_SUMMARY_ARTIFACT,
|
|
257
|
-
research_source_skill: RESEARCH_SOURCE_SKILL_ARTIFACT
|
|
271
|
+
research_source_skill: RESEARCH_SOURCE_SKILL_ARTIFACT,
|
|
272
|
+
quality_contract: RESEARCH_QUALITY_CONTRACT_ARTIFACT,
|
|
273
|
+
claim_evidence_matrix: CLAIM_EVIDENCE_MATRIX_ARTIFACT,
|
|
274
|
+
source_quality_report: SOURCE_QUALITY_REPORT_ARTIFACT,
|
|
275
|
+
implementation_blueprint: IMPLEMENTATION_BLUEPRINT_ARTIFACT,
|
|
276
|
+
implementation_blueprint_markdown: IMPLEMENTATION_BLUEPRINT_MARKDOWN_ARTIFACT,
|
|
277
|
+
experiment_plan: EXPERIMENT_PLAN_JSON_ARTIFACT,
|
|
278
|
+
experiment_plan_markdown: EXPERIMENT_PLAN_MARKDOWN_ARTIFACT,
|
|
279
|
+
replication_pack: REPLICATION_PACK_ARTIFACT,
|
|
280
|
+
final_review: RESEARCH_FINAL_REVIEW_ARTIFACT,
|
|
281
|
+
research_work_graph: RESEARCH_WORK_GRAPH_ARTIFACT
|
|
258
282
|
},
|
|
259
283
|
objective: 'Find the shortest useful mechanism that can be falsified or applied, grounded in maximum available source retrieval rather than broad summary.',
|
|
260
284
|
execution_policy: {
|
|
@@ -365,6 +389,16 @@ export function createResearchPlan(prompt, opts = {}) {
|
|
|
365
389
|
paperArtifact,
|
|
366
390
|
RESEARCH_GENIUS_SUMMARY_ARTIFACT,
|
|
367
391
|
RESEARCH_SOURCE_SKILL_ARTIFACT,
|
|
392
|
+
RESEARCH_QUALITY_CONTRACT_ARTIFACT,
|
|
393
|
+
CLAIM_EVIDENCE_MATRIX_ARTIFACT,
|
|
394
|
+
SOURCE_QUALITY_REPORT_ARTIFACT,
|
|
395
|
+
IMPLEMENTATION_BLUEPRINT_ARTIFACT,
|
|
396
|
+
IMPLEMENTATION_BLUEPRINT_MARKDOWN_ARTIFACT,
|
|
397
|
+
EXPERIMENT_PLAN_JSON_ARTIFACT,
|
|
398
|
+
EXPERIMENT_PLAN_MARKDOWN_ARTIFACT,
|
|
399
|
+
REPLICATION_PACK_ARTIFACT,
|
|
400
|
+
RESEARCH_FINAL_REVIEW_ARTIFACT,
|
|
401
|
+
RESEARCH_WORK_GRAPH_ARTIFACT,
|
|
368
402
|
'source-ledger.json',
|
|
369
403
|
'agent-ledger.json',
|
|
370
404
|
'debate-ledger.json',
|
|
@@ -391,6 +425,20 @@ export function researchPlanMarkdown(plan) {
|
|
|
391
425
|
if (plan.mutation_policy)
|
|
392
426
|
lines.push(`Mutation policy: ${plan.mutation_policy.rule}`);
|
|
393
427
|
lines.push('');
|
|
428
|
+
if (plan.quality_contract) {
|
|
429
|
+
const contract = plan.quality_contract;
|
|
430
|
+
lines.push('## Quality Contract');
|
|
431
|
+
lines.push(`- minimum sources: ${contract.min_sources_total}`);
|
|
432
|
+
lines.push(`- minimum source layers covered: ${contract.min_source_layers_covered}`);
|
|
433
|
+
lines.push(`- minimum counterevidence sources: ${contract.min_counterevidence_sources}`);
|
|
434
|
+
lines.push(`- minimum key claims: ${contract.min_key_claims}`);
|
|
435
|
+
lines.push(`- minimum triangulated claims: ${contract.min_trianguled_claims}`);
|
|
436
|
+
lines.push(`- minimum blueprint sections: ${contract.min_implementation_blueprint_sections}`);
|
|
437
|
+
lines.push(`- minimum falsification cases: ${contract.min_falsification_cases}`);
|
|
438
|
+
lines.push(`- minimum experiment steps: ${contract.min_experiment_steps}`);
|
|
439
|
+
lines.push(`- minimum report words: ${contract.min_report_words}`);
|
|
440
|
+
lines.push('');
|
|
441
|
+
}
|
|
394
442
|
if (plan.native_agent_plan) {
|
|
395
443
|
lines.push('## Native Agent Plan');
|
|
396
444
|
lines.push(`Backend: ${plan.native_agent_plan.backend}`);
|
|
@@ -489,10 +537,7 @@ export function countGeniusOpinionSummaries(text = '') {
|
|
|
489
537
|
}
|
|
490
538
|
export async function writeResearchPlan(dir, prompt, opts = {}) {
|
|
491
539
|
const plan = createResearchPlan(prompt, opts);
|
|
492
|
-
|
|
493
|
-
await writeTextAtomic(path.join(dir, 'research-plan.md'), researchPlanMarkdown(plan));
|
|
494
|
-
await writeTextAtomic(path.join(dir, RESEARCH_SOURCE_SKILL_ARTIFACT), researchSourceSkillMarkdown(plan));
|
|
495
|
-
await writeJsonAtomic(path.join(dir, 'novelty-ledger.json'), {
|
|
540
|
+
const noveltyLedger = {
|
|
496
541
|
schema_version: 1,
|
|
497
542
|
entries: [],
|
|
498
543
|
rubric: {
|
|
@@ -500,8 +545,26 @@ export async function writeResearchPlan(dir, prompt, opts = {}) {
|
|
|
500
545
|
confidence: '0 speculation, 1 weak, 2 supported, 3 strongly supported',
|
|
501
546
|
falsifiability: '0 vague, 1 indirectly testable, 2 directly testable, 3 cheap decisive test exists'
|
|
502
547
|
}
|
|
503
|
-
}
|
|
504
|
-
|
|
548
|
+
};
|
|
549
|
+
const sourceLedger = defaultSourceLedger(plan);
|
|
550
|
+
const claimMatrix = defaultClaimEvidenceMatrix(plan.mission_id || '');
|
|
551
|
+
const blueprint = defaultImplementationBlueprint(plan);
|
|
552
|
+
const experimentPlan = defaultExperimentPlan(plan);
|
|
553
|
+
const replicationPack = defaultReplicationPack(plan);
|
|
554
|
+
await writeJsonAtomic(path.join(dir, 'research-plan.json'), plan);
|
|
555
|
+
await writeTextAtomic(path.join(dir, 'research-plan.md'), researchPlanMarkdown(plan));
|
|
556
|
+
await writeTextAtomic(path.join(dir, RESEARCH_SOURCE_SKILL_ARTIFACT), researchSourceSkillMarkdown(plan));
|
|
557
|
+
await writeResearchQualityContract(dir, plan.quality_contract);
|
|
558
|
+
await writeJsonAtomic(path.join(dir, 'novelty-ledger.json'), noveltyLedger);
|
|
559
|
+
await writeJsonAtomic(path.join(dir, 'source-ledger.json'), sourceLedger);
|
|
560
|
+
await writeClaimEvidenceMatrix(dir, claimMatrix);
|
|
561
|
+
await writeSourceQualityReport(dir, sourceLedger, claimMatrix);
|
|
562
|
+
await writeImplementationBlueprint(dir, blueprint);
|
|
563
|
+
await writeTextAtomic(path.join(dir, IMPLEMENTATION_BLUEPRINT_MARKDOWN_ARTIFACT), renderImplementationBlueprintMarkdown(blueprint));
|
|
564
|
+
await writeExperimentPlan(dir, experimentPlan);
|
|
565
|
+
await writeReplicationPack(dir, replicationPack);
|
|
566
|
+
await writeResearchHandoffArtifacts(dir, plan, blueprint);
|
|
567
|
+
await writeResearchWorkGraph(dir, plan);
|
|
505
568
|
await writeJsonAtomic(path.join(dir, 'agent-ledger.json'), defaultAgentLedger(plan));
|
|
506
569
|
await writeJsonAtomic(path.join(dir, 'debate-ledger.json'), defaultDebateLedger(plan));
|
|
507
570
|
await writeJsonAtomic(path.join(dir, 'falsification-ledger.json'), defaultFalsificationLedger());
|
|
@@ -648,10 +711,14 @@ export function defaultSourceLedger(plan = null) {
|
|
|
648
711
|
},
|
|
649
712
|
quality_model: {
|
|
650
713
|
reporting_basis: 'Record enough source metadata to make search reproducible, including query, layer, locator, publisher or author, publication date when known, accessed_at, reliability, credibility, stance, and cited claim ids.',
|
|
651
|
-
source_quality_fields: ['layer', 'kind', 'title', 'locator', 'publisher_or_author', 'published_at', 'accessed_at', 'reliability', 'credibility', 'stance', 'supports', 'undermines']
|
|
714
|
+
source_quality_fields: ['layer', 'kind', 'title', 'locator', 'publisher_or_author', 'published_at', 'accessed_at', 'reliability', 'credibility', 'stance', 'supports', 'undermines', 'claim_ids']
|
|
652
715
|
},
|
|
653
716
|
citation_coverage: {
|
|
654
717
|
all_key_claims_cited: false,
|
|
718
|
+
key_claim_ids: [],
|
|
719
|
+
cited_claim_ids: [],
|
|
720
|
+
uncited_claim_ids: [],
|
|
721
|
+
source_claim_map: {},
|
|
655
722
|
notes: []
|
|
656
723
|
},
|
|
657
724
|
blockers: []
|
|
@@ -723,7 +790,12 @@ export function defaultDebateLedger(plan = null) {
|
|
|
723
790
|
export function defaultFalsificationLedger() {
|
|
724
791
|
return {
|
|
725
792
|
schema_version: 1,
|
|
793
|
+
schema: 'sks.falsification-ledger.v1',
|
|
726
794
|
created_at: nowIso(),
|
|
795
|
+
quality_contract: {
|
|
796
|
+
min_cases: DEFAULT_RESEARCH_QUALITY_CONTRACT.min_falsification_cases,
|
|
797
|
+
required_fields: ['id', 'target_claim', 'attack', 'source_ids', 'result', 'next_decisive_test']
|
|
798
|
+
},
|
|
727
799
|
cases: [],
|
|
728
800
|
unresolved_failures: [],
|
|
729
801
|
next_decisive_tests: []
|
|
@@ -827,8 +899,12 @@ export function defaultResearchGate() {
|
|
|
827
899
|
}
|
|
828
900
|
export async function evaluateResearchGate(dir) {
|
|
829
901
|
const gate = await readJson(path.join(dir, 'research-gate.json'), defaultResearchGate());
|
|
902
|
+
const contract = await readResearchQualityContract(dir);
|
|
830
903
|
const plan = await readJson(path.join(dir, 'research-plan.json'), null);
|
|
831
904
|
const reportPresent = await exists(path.join(dir, 'research-report.md'));
|
|
905
|
+
const reportText = reportPresent ? await readText(path.join(dir, 'research-report.md'), '') : '';
|
|
906
|
+
const reportQuality = analyzeResearchReportQuality(reportText);
|
|
907
|
+
const reportWordCount = countWords(reportText);
|
|
832
908
|
const paperArtifact = await findResearchPaperArtifact(dir, plan);
|
|
833
909
|
const paperPresent = paperArtifact.exists;
|
|
834
910
|
const paperText = paperPresent ? await readText(paperArtifact.path, '') : '';
|
|
@@ -845,10 +921,25 @@ export async function evaluateResearchGate(dir) {
|
|
|
845
921
|
const agentLedger = await readJson(path.join(dir, 'agent-ledger.json'), null);
|
|
846
922
|
const debateLedger = await readJson(path.join(dir, 'debate-ledger.json'), null);
|
|
847
923
|
const falsificationLedger = await readJson(path.join(dir, 'falsification-ledger.json'), null);
|
|
924
|
+
const noveltyLedger = await readJson(path.join(dir, 'novelty-ledger.json'), null);
|
|
925
|
+
const claimMatrixSummary = await readClaimEvidenceMatrix(dir);
|
|
926
|
+
const claimMatrix = claimMatrixSummary.matrix;
|
|
927
|
+
const claimMatrixValidation = validateClaimEvidenceMatrix(claimMatrix, sourceLedger, falsificationLedger);
|
|
928
|
+
const blueprint = await readImplementationBlueprint(dir);
|
|
929
|
+
const blueprintValidation = validateImplementationBlueprint(blueprint, contract);
|
|
930
|
+
const experimentPlan = await readExperimentPlan(dir);
|
|
931
|
+
const experimentValidation = validateExperimentPlan(experimentPlan, contract);
|
|
932
|
+
const replicationPack = await readReplicationPack(dir);
|
|
933
|
+
const replicationValidation = validateReplicationPack(replicationPack);
|
|
934
|
+
const falsificationValidation = validateFalsificationCoverage(falsificationLedger, contract);
|
|
935
|
+
let sourceQualityReport = await readSourceQualityReport(dir);
|
|
936
|
+
if (!sourceQualityReport && sourceLedger)
|
|
937
|
+
sourceQualityReport = await writeSourceQualityReport(dir, sourceLedger, claimMatrix);
|
|
848
938
|
const geniusSummaryText = geniusSummaryPresent ? await readText(path.join(dir, RESEARCH_GENIUS_SUMMARY_ARTIFACT), '') : '';
|
|
849
939
|
const personaValidation = validateResearchAgentLedger(agentLedger || {}, geniusSummaryText);
|
|
850
940
|
const sourceEntries = Array.isArray(sourceLedger?.sources) ? sourceLedger.sources.length : 0;
|
|
851
941
|
const counterEvidenceEntries = Array.isArray(sourceLedger?.counterevidence_sources) ? sourceLedger.counterevidence_sources.length : 0;
|
|
942
|
+
const totalSourceEntries = sourceEntries + counterEvidenceEntries;
|
|
852
943
|
const webSearchPasses = Math.max(Number(gate.web_search_passes || 0), Number(sourceLedger?.web_search_passes || 0));
|
|
853
944
|
const requiredSourceLayers = sourceLayerIdsForPlan(plan);
|
|
854
945
|
const sourceLayerStats = sourceLayerCoverageStats(sourceLedger, requiredSourceLayers);
|
|
@@ -871,6 +962,10 @@ export async function evaluateResearchGate(dir) {
|
|
|
871
962
|
const reasons = [];
|
|
872
963
|
if (!reportPresent && gate.report_present !== true)
|
|
873
964
|
reasons.push('research_report_missing');
|
|
965
|
+
if (reportWordCount < contract.min_report_words)
|
|
966
|
+
reasons.push('research_report_too_short');
|
|
967
|
+
if (!reportQuality.ok)
|
|
968
|
+
reasons.push(...reportQuality.blockers);
|
|
874
969
|
if (!paperPresent)
|
|
875
970
|
reasons.push('research_paper_missing');
|
|
876
971
|
if (paperSections < RESEARCH_PAPER_SECTION_GROUPS.length)
|
|
@@ -895,8 +990,12 @@ export async function evaluateResearchGate(dir) {
|
|
|
895
990
|
reasons.push('web_search_pass_missing');
|
|
896
991
|
if (Math.max(Number(gate.source_entries || 0), sourceEntries) < 1)
|
|
897
992
|
reasons.push('source_entry_missing');
|
|
993
|
+
if (Math.max(Number(gate.source_entries || 0), totalSourceEntries) < contract.min_sources_total)
|
|
994
|
+
reasons.push('source_entries_below_research_quality_contract');
|
|
898
995
|
if (Math.max(Number(gate.source_layers_covered || 0), sourceLayerStats.covered.length) < requiredSourceLayers.length)
|
|
899
996
|
reasons.push('source_layer_coverage_missing');
|
|
997
|
+
if (Math.max(Number(gate.source_layers_covered || 0), sourceLayerStats.covered.length) < contract.min_source_layers_covered)
|
|
998
|
+
reasons.push('source_layer_coverage_below_contract');
|
|
900
999
|
if (Math.max(Number(gate.triangulation_checks || 0), triangulationChecks) < 1)
|
|
901
1000
|
reasons.push('cross_layer_triangulation_missing');
|
|
902
1001
|
if (Math.max(Number(gate.independent_agents || 0), independentAgents) < RESEARCH_AGENT_COUNCIL.length)
|
|
@@ -919,16 +1018,66 @@ export async function evaluateResearchGate(dir) {
|
|
|
919
1018
|
reasons.push('unanimous_consensus_missing');
|
|
920
1019
|
if (Math.max(Number(gate.counterevidence_sources || 0), counterEvidenceEntries) < 1)
|
|
921
1020
|
reasons.push('counterevidence_source_missing');
|
|
1021
|
+
if (Math.max(Number(gate.counterevidence_sources || 0), counterEvidenceEntries) < contract.min_counterevidence_sources)
|
|
1022
|
+
reasons.push('counterevidence_below_contract');
|
|
922
1023
|
if ((gate.candidate_insights || 0) < 1)
|
|
923
1024
|
reasons.push('candidate_insight_missing');
|
|
924
1025
|
if ((gate.falsification_passes || 0) < 1)
|
|
925
1026
|
reasons.push('falsification_missing');
|
|
926
1027
|
if (Math.max(Number(gate.falsification_cases || 0), falsificationCases) < 1)
|
|
927
1028
|
reasons.push('falsification_case_missing');
|
|
1029
|
+
if (!falsificationValidation.ok)
|
|
1030
|
+
reasons.push(...falsificationValidation.blockers);
|
|
928
1031
|
if ((gate.testable_predictions || 0) < 1)
|
|
929
1032
|
reasons.push('testable_prediction_missing');
|
|
930
1033
|
if (!citationCoverage)
|
|
931
1034
|
reasons.push('citation_coverage_missing');
|
|
1035
|
+
if (!claimMatrixSummary.present)
|
|
1036
|
+
reasons.push('claim_evidence_matrix_missing');
|
|
1037
|
+
if (claimMatrix.key_claim_ids.length < contract.min_key_claims)
|
|
1038
|
+
reasons.push('key_claims_below_contract');
|
|
1039
|
+
if (claimMatrix.triangulated_claim_count < contract.min_trianguled_claims)
|
|
1040
|
+
reasons.push('triangulated_claims_below_contract');
|
|
1041
|
+
if (!claimMatrixValidation.ok)
|
|
1042
|
+
reasons.push(...claimMatrixValidation.blockers);
|
|
1043
|
+
if (!sourceQualityReport)
|
|
1044
|
+
reasons.push('source_quality_report_missing');
|
|
1045
|
+
if (sourceQualityReport && sourceQualityReport.ok !== true)
|
|
1046
|
+
reasons.push(...(Array.isArray(sourceQualityReport.blockers) ? sourceQualityReport.blockers : ['source_quality_report_not_ok']));
|
|
1047
|
+
if (!blueprint)
|
|
1048
|
+
reasons.push('implementation_blueprint_missing');
|
|
1049
|
+
if (!blueprintValidation.ok)
|
|
1050
|
+
reasons.push(...blueprintValidation.blockers);
|
|
1051
|
+
if (!experimentPlan)
|
|
1052
|
+
reasons.push('experiment_plan_missing');
|
|
1053
|
+
if (!experimentValidation.ok)
|
|
1054
|
+
reasons.push(...experimentValidation.blockers);
|
|
1055
|
+
if (!replicationPack)
|
|
1056
|
+
reasons.push('replication_pack_missing');
|
|
1057
|
+
if (!replicationValidation.ok)
|
|
1058
|
+
reasons.push(...replicationValidation.blockers);
|
|
1059
|
+
for (const artifact of contract.required_artifacts || []) {
|
|
1060
|
+
if (artifact === RESEARCH_FINAL_REVIEW_ARTIFACT)
|
|
1061
|
+
continue;
|
|
1062
|
+
if (!(await exists(path.join(dir, artifact))))
|
|
1063
|
+
reasons.push(`required_artifact_missing:${artifact}`);
|
|
1064
|
+
}
|
|
1065
|
+
let finalReview = await readResearchFinalReview(dir);
|
|
1066
|
+
finalReview = await runResearchFinalReviewer(dir, {
|
|
1067
|
+
contract,
|
|
1068
|
+
sourceLedger,
|
|
1069
|
+
claimMatrix,
|
|
1070
|
+
blueprint,
|
|
1071
|
+
experimentPlan,
|
|
1072
|
+
replicationPack,
|
|
1073
|
+
falsificationLedger,
|
|
1074
|
+
reportText,
|
|
1075
|
+
preliminaryReasons: reasons
|
|
1076
|
+
});
|
|
1077
|
+
if (!finalReview)
|
|
1078
|
+
reasons.push('final_review_missing');
|
|
1079
|
+
if (finalReview && finalReview.approved !== true)
|
|
1080
|
+
reasons.push('research_final_review_not_approved');
|
|
932
1081
|
if (searchBlockers.length > 0)
|
|
933
1082
|
reasons.push('web_search_blocked');
|
|
934
1083
|
if (gate.unsafe_or_destructive_actions === true)
|
|
@@ -942,16 +1091,31 @@ export async function evaluateResearchGate(dir) {
|
|
|
942
1091
|
metrics: {
|
|
943
1092
|
research_paper_artifact: paperArtifact.name,
|
|
944
1093
|
paper_present: paperPresent || gate.paper_present === true,
|
|
1094
|
+
quality_contract: contract,
|
|
1095
|
+
report_word_count: reportWordCount,
|
|
1096
|
+
report_min_words: contract.min_report_words,
|
|
1097
|
+
report_quality: reportQuality,
|
|
945
1098
|
web_search_passes: webSearchPasses,
|
|
946
1099
|
paper_sections: Math.max(Number(gate.paper_sections || 0), paperSections),
|
|
947
1100
|
genius_opinion_summary_present: geniusSummaryPresent || gate.genius_opinion_summary_present === true,
|
|
948
1101
|
genius_opinion_summaries: Math.max(Number(gate.genius_opinion_summaries || 0), geniusSummaryCount),
|
|
949
1102
|
research_source_skill_present: sourceSkillPresent || gate.research_source_skill_present === true,
|
|
950
1103
|
source_entries: Math.max(Number(gate.source_entries || 0), sourceEntries),
|
|
1104
|
+
source_entries_total_with_counterevidence: totalSourceEntries,
|
|
1105
|
+
min_sources_total: contract.min_sources_total,
|
|
951
1106
|
source_layers_required: requiredSourceLayers.length,
|
|
952
1107
|
source_layers_covered: Math.max(Number(gate.source_layers_covered || 0), sourceLayerStats.covered.length),
|
|
1108
|
+
min_source_layers_covered: contract.min_source_layers_covered,
|
|
953
1109
|
source_layers_missing: sourceLayerStats.missing,
|
|
954
1110
|
triangulation_checks: Math.max(Number(gate.triangulation_checks || 0), triangulationChecks),
|
|
1111
|
+
claim_evidence_matrix_present: claimMatrixSummary.present,
|
|
1112
|
+
key_claims: claimMatrix.key_claim_ids.length,
|
|
1113
|
+
min_key_claims: contract.min_key_claims,
|
|
1114
|
+
triangulated_claims: claimMatrix.triangulated_claim_count,
|
|
1115
|
+
min_triangulated_claims: contract.min_trianguled_claims,
|
|
1116
|
+
claim_evidence_matrix_ok: claimMatrixValidation.ok,
|
|
1117
|
+
claim_evidence_matrix_blockers: claimMatrixValidation.blockers,
|
|
1118
|
+
source_quality_report_ok: sourceQualityReport?.ok === true,
|
|
955
1119
|
independent_agents: Math.max(Number(gate.independent_agents || 0), independentAgents),
|
|
956
1120
|
xhigh_agents: Math.max(Number(gate.xhigh_agents || 0), xhighAgents),
|
|
957
1121
|
eureka_moments: Math.max(Number(gate.eureka_moments || 0), eurekaMoments),
|
|
@@ -965,7 +1129,15 @@ export async function evaluateResearchGate(dir) {
|
|
|
965
1129
|
consensus_agreed_agents: consensus.agreed_count,
|
|
966
1130
|
consensus_missing_agents: consensus.missing,
|
|
967
1131
|
counterevidence_sources: Math.max(Number(gate.counterevidence_sources || 0), counterEvidenceEntries),
|
|
1132
|
+
min_counterevidence_sources: contract.min_counterevidence_sources,
|
|
968
1133
|
falsification_cases: Math.max(Number(gate.falsification_cases || 0), falsificationCases),
|
|
1134
|
+
falsification_validation: falsificationValidation,
|
|
1135
|
+
implementation_blueprint_validation: blueprintValidation,
|
|
1136
|
+
experiment_plan_validation: experimentValidation,
|
|
1137
|
+
replication_pack_validation: replicationValidation,
|
|
1138
|
+
novelty_entries: Array.isArray(noveltyLedger?.entries) ? noveltyLedger.entries.length : null,
|
|
1139
|
+
final_review_approved: finalReview?.approved === true,
|
|
1140
|
+
final_review_blockers: Array.isArray(finalReview?.blockers) ? finalReview.blockers : [],
|
|
969
1141
|
citation_coverage: citationCoverage,
|
|
970
1142
|
web_search_blockers: searchBlockers.length
|
|
971
1143
|
},
|
|
@@ -980,20 +1152,42 @@ export async function evaluateResearchGate(dir) {
|
|
|
980
1152
|
}
|
|
981
1153
|
export async function writeMockResearchResult(dir, plan) {
|
|
982
1154
|
const paperArtifact = researchPaperArtifactForPlan(plan);
|
|
983
|
-
const
|
|
1155
|
+
const mockClaimIds = Array.from({ length: DEFAULT_RESEARCH_QUALITY_CONTRACT.min_key_claims }, (_unused, index) => `mock-claim-${index + 1}`);
|
|
1156
|
+
const primaryMockSources = RESEARCH_SOURCE_LAYERS.map((layer, index) => ({
|
|
984
1157
|
id: `mock-source-${index + 1}`,
|
|
985
1158
|
layer: layer.id,
|
|
986
1159
|
kind: 'selftest',
|
|
987
1160
|
title: `Mock ${layer.label} coverage`,
|
|
988
1161
|
locator: 'writeMockResearchResult',
|
|
1162
|
+
publisher_or_author: 'SKS mock research fixture',
|
|
1163
|
+
published_at: nowIso().slice(0, 10),
|
|
989
1164
|
accessed_at: nowIso(),
|
|
990
1165
|
reliability: 'mock',
|
|
991
1166
|
credibility: 'mock',
|
|
992
1167
|
stance: layer.id === 'counterevidence_factcheck' ? 'undermines' : 'supports',
|
|
993
|
-
supports: layer.id === 'counterevidence_factcheck' ? [] : [
|
|
994
|
-
undermines: layer.id === 'counterevidence_factcheck' ? [
|
|
1168
|
+
supports: layer.id === 'counterevidence_factcheck' ? [] : [mockClaimIds[index % mockClaimIds.length]],
|
|
1169
|
+
undermines: layer.id === 'counterevidence_factcheck' ? [mockClaimIds[0]] : [],
|
|
1170
|
+
claim_ids: [mockClaimIds[index % mockClaimIds.length]],
|
|
995
1171
|
notes: `Selftest fixture for the ${layer.id} source layer; no live web call is made in --mock mode.`
|
|
996
1172
|
}));
|
|
1173
|
+
const supplementalMockSources = RESEARCH_SOURCE_LAYERS.map((layer, index) => ({
|
|
1174
|
+
id: `mock-source-${index + 8}`,
|
|
1175
|
+
layer: layer.id,
|
|
1176
|
+
kind: 'selftest-supplement',
|
|
1177
|
+
title: `Supplemental mock ${layer.label} triangulation`,
|
|
1178
|
+
locator: 'writeMockResearchResult',
|
|
1179
|
+
publisher_or_author: 'SKS mock research fixture',
|
|
1180
|
+
published_at: nowIso().slice(0, 10),
|
|
1181
|
+
accessed_at: nowIso(),
|
|
1182
|
+
reliability: 'mock',
|
|
1183
|
+
credibility: 'mock',
|
|
1184
|
+
stance: layer.id === 'counterevidence_factcheck' ? 'undermines' : 'supports',
|
|
1185
|
+
supports: layer.id === 'counterevidence_factcheck' ? [] : [mockClaimIds[(index + 1) % mockClaimIds.length]],
|
|
1186
|
+
undermines: layer.id === 'counterevidence_factcheck' ? [mockClaimIds[(index + 2) % mockClaimIds.length]] : [],
|
|
1187
|
+
claim_ids: [mockClaimIds[(index + 1) % mockClaimIds.length]],
|
|
1188
|
+
notes: `Second selftest source for ${layer.id}; it makes source-count and triangulation checks non-trivial.`
|
|
1189
|
+
}));
|
|
1190
|
+
const mockLayerSources = [...primaryMockSources, ...supplementalMockSources];
|
|
997
1191
|
const sourceLedger = {
|
|
998
1192
|
schema_version: 1,
|
|
999
1193
|
policy: 'layered_source_retrieval_and_triangulation',
|
|
@@ -1011,8 +1205,8 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1011
1205
|
status: 'covered',
|
|
1012
1206
|
evidence_role: layer.evidence_role,
|
|
1013
1207
|
query_templates: layer.query_templates || [],
|
|
1014
|
-
source_ids: [`mock-source-${index + 1}`],
|
|
1015
|
-
counterevidence_ids: layer.id === 'counterevidence_factcheck' ? ['mock-counter-1'] : [],
|
|
1208
|
+
source_ids: [`mock-source-${index + 1}`, `mock-source-${index + 8}`],
|
|
1209
|
+
counterevidence_ids: layer.id === 'counterevidence_factcheck' ? ['mock-counter-1', 'mock-counter-2'] : [],
|
|
1016
1210
|
blocker: null,
|
|
1017
1211
|
notes: 'Mock mode records layer coverage without live web access.'
|
|
1018
1212
|
})),
|
|
@@ -1036,12 +1230,31 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1036
1230
|
kind: 'selftest',
|
|
1037
1231
|
title: 'Mock overclaim counterexample',
|
|
1038
1232
|
locator: 'writeMockResearchResult',
|
|
1233
|
+
publisher_or_author: 'SKS mock research fixture',
|
|
1234
|
+
published_at: nowIso().slice(0, 10),
|
|
1039
1235
|
accessed_at: nowIso(),
|
|
1040
1236
|
reliability: 'mock',
|
|
1041
1237
|
credibility: 'mock',
|
|
1042
1238
|
stance: 'undermines',
|
|
1043
|
-
undermines: [
|
|
1239
|
+
undermines: [mockClaimIds[0]],
|
|
1240
|
+
claim_ids: [mockClaimIds[0]],
|
|
1044
1241
|
notes: 'Shows the gate must fail if a run produces no tests or falsifiers.'
|
|
1242
|
+
},
|
|
1243
|
+
{
|
|
1244
|
+
id: 'mock-counter-2',
|
|
1245
|
+
layer: 'counterevidence_factcheck',
|
|
1246
|
+
kind: 'selftest',
|
|
1247
|
+
title: 'Mock missing-replication counterexample',
|
|
1248
|
+
locator: 'writeMockResearchResult',
|
|
1249
|
+
publisher_or_author: 'SKS mock research fixture',
|
|
1250
|
+
published_at: nowIso().slice(0, 10),
|
|
1251
|
+
accessed_at: nowIso(),
|
|
1252
|
+
reliability: 'mock',
|
|
1253
|
+
credibility: 'mock',
|
|
1254
|
+
stance: 'undermines',
|
|
1255
|
+
undermines: [mockClaimIds[1]],
|
|
1256
|
+
claim_ids: [mockClaimIds[1]],
|
|
1257
|
+
notes: 'Shows the gate must fail if replication commands and experiment steps are absent.'
|
|
1045
1258
|
}
|
|
1046
1259
|
],
|
|
1047
1260
|
triangulation: {
|
|
@@ -1065,7 +1278,11 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1065
1278
|
quality_model: defaultSourceLedger(plan).quality_model,
|
|
1066
1279
|
citation_coverage: {
|
|
1067
1280
|
all_key_claims_cited: true,
|
|
1068
|
-
|
|
1281
|
+
key_claim_ids: mockClaimIds,
|
|
1282
|
+
cited_claim_ids: mockClaimIds,
|
|
1283
|
+
uncited_claim_ids: [],
|
|
1284
|
+
source_claim_map: Object.fromEntries(mockLayerSources.map((source) => [source.id, source.claim_ids || []])),
|
|
1285
|
+
notes: ['mock report, claim matrix, and novelty ledger cite all mock key claims']
|
|
1069
1286
|
},
|
|
1070
1287
|
blockers: []
|
|
1071
1288
|
};
|
|
@@ -1139,35 +1356,48 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1139
1356
|
};
|
|
1140
1357
|
const falsificationLedger = {
|
|
1141
1358
|
schema_version: 1,
|
|
1359
|
+
schema: 'sks.falsification-ledger.v1',
|
|
1142
1360
|
created_at: nowIso(),
|
|
1143
|
-
cases:
|
|
1144
|
-
{
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1361
|
+
cases: Array.from({ length: DEFAULT_RESEARCH_QUALITY_CONTRACT.min_falsification_cases }, (_unused, index) => ({
|
|
1362
|
+
id: `mock-falsification-${index + 1}`,
|
|
1363
|
+
target_claim: mockClaimIds[index % mockClaimIds.length],
|
|
1364
|
+
attack: [
|
|
1365
|
+
'The claim fails if the output only summarizes background material.',
|
|
1366
|
+
'The claim fails if no independent source layer confirms it.',
|
|
1367
|
+
'The claim fails if counterevidence is absent.',
|
|
1368
|
+
'The claim fails if no replication step can be run.'
|
|
1369
|
+
][index] || 'The claim fails if the decisive test cannot be specified.',
|
|
1370
|
+
source_ids: [index % 2 === 0 ? 'mock-counter-1' : 'mock-counter-2'],
|
|
1371
|
+
result: 'survives_with_gate_requirement',
|
|
1372
|
+
next_decisive_test: `Run decisive mock test ${index + 1} and compare against a summary-only baseline.`
|
|
1373
|
+
})),
|
|
1153
1374
|
unresolved_failures: [],
|
|
1154
1375
|
next_decisive_tests: ['Run paired prompt comparison and measure cited testable insights.']
|
|
1155
1376
|
};
|
|
1156
1377
|
const ledger = {
|
|
1157
1378
|
schema_version: 1,
|
|
1158
|
-
entries:
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1379
|
+
entries: mockClaimIds.map((claimId, index) => ({
|
|
1380
|
+
id: claimId,
|
|
1381
|
+
claim: [
|
|
1382
|
+
'A useful research run must optimize for falsifiable novelty, not only breadth of summary.',
|
|
1383
|
+
'Source quality must be a first-class artifact rather than an implicit reviewer judgment.',
|
|
1384
|
+
'A claim matrix makes implementation handoff safer by separating facts, hypotheses, and recommendations.',
|
|
1385
|
+
'Counterevidence needs its own minimum threshold because single-source skepticism is too brittle.',
|
|
1386
|
+
'A report-length floor catches summary-only outputs that dodge hard synthesis.',
|
|
1387
|
+
'An implementation blueprint turns research into actionable but still read-only handoff material.',
|
|
1388
|
+
'Replication artifacts make research pipeline behavior auditable after the run.',
|
|
1389
|
+
'A final reviewer artifact prevents passed gates from relying on unstated assumptions.'
|
|
1390
|
+
][index],
|
|
1391
|
+
type: index < 3 ? 'methodological_insight' : 'implementation_guidance',
|
|
1392
|
+
novelty: 2,
|
|
1393
|
+
confidence: 2,
|
|
1394
|
+
falsifiability: 2,
|
|
1395
|
+
source_ids: [`mock-source-${(index % RESEARCH_SOURCE_LAYERS.length) + 1}`, `mock-source-${((index + 1) % RESEARCH_SOURCE_LAYERS.length) + 1}`],
|
|
1396
|
+
counterevidence_ids: [index % 2 === 0 ? 'mock-counter-1' : 'mock-counter-2'],
|
|
1397
|
+
evidence: [`mock-source-${(index % RESEARCH_SOURCE_LAYERS.length) + 1}`, `mock-source-${((index + 1) % RESEARCH_SOURCE_LAYERS.length) + 1}`],
|
|
1398
|
+
falsifiers: [index % 2 === 0 ? 'mock-counter-1' : 'mock-counter-2'],
|
|
1399
|
+
next_experiment: `Run the same topic through summary-only and discovery-loop prompts, then compare claim ${index + 1} support, falsification, and reproducibility.`
|
|
1400
|
+
}))
|
|
1171
1401
|
};
|
|
1172
1402
|
const geniusSummary = [
|
|
1173
1403
|
'# Genius Opinion Summary',
|
|
@@ -1186,14 +1416,80 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1186
1416
|
'## Council Consensus',
|
|
1187
1417
|
'The council keeps one modest, testable claim: Research Mode is useful when it writes a source-cited paper, records every agent opinion, triangulates across source layers, and exposes the next decisive test.'
|
|
1188
1418
|
].join('\n');
|
|
1419
|
+
const claimMatrix = buildClaimEvidenceMatrixFromLedgers({
|
|
1420
|
+
missionId: plan?.mission_id || '',
|
|
1421
|
+
sourceLedger,
|
|
1422
|
+
noveltyLedger: ledger,
|
|
1423
|
+
falsificationLedger
|
|
1424
|
+
});
|
|
1425
|
+
const blueprint = defaultImplementationBlueprint(plan);
|
|
1426
|
+
const experimentPlan = defaultExperimentPlan(plan);
|
|
1427
|
+
const replicationPack = defaultReplicationPack(plan);
|
|
1189
1428
|
await writeTextAtomic(path.join(dir, RESEARCH_SOURCE_SKILL_ARTIFACT), researchSourceSkillMarkdown(plan));
|
|
1190
1429
|
await writeJsonAtomic(path.join(dir, 'source-ledger.json'), sourceLedger);
|
|
1430
|
+
await writeResearchQualityContract(dir, plan.quality_contract || DEFAULT_RESEARCH_QUALITY_CONTRACT);
|
|
1431
|
+
await writeClaimEvidenceMatrix(dir, claimMatrix);
|
|
1432
|
+
await writeSourceQualityReport(dir, sourceLedger, claimMatrix);
|
|
1433
|
+
await writeImplementationBlueprint(dir, blueprint);
|
|
1434
|
+
await writeTextAtomic(path.join(dir, IMPLEMENTATION_BLUEPRINT_MARKDOWN_ARTIFACT), renderImplementationBlueprintMarkdown(blueprint));
|
|
1435
|
+
await writeExperimentPlan(dir, experimentPlan);
|
|
1436
|
+
await writeReplicationPack(dir, replicationPack);
|
|
1437
|
+
await writeResearchHandoffArtifacts(dir, plan, blueprint);
|
|
1438
|
+
await writeResearchWorkGraph(dir, plan);
|
|
1191
1439
|
await writeJsonAtomic(path.join(dir, 'agent-ledger.json'), agentLedger);
|
|
1192
1440
|
await writeJsonAtomic(path.join(dir, 'debate-ledger.json'), debateLedger);
|
|
1193
1441
|
await writeJsonAtomic(path.join(dir, 'falsification-ledger.json'), falsificationLedger);
|
|
1194
1442
|
await writeJsonAtomic(path.join(dir, 'novelty-ledger.json'), ledger);
|
|
1195
1443
|
await writeTextAtomic(path.join(dir, RESEARCH_GENIUS_SUMMARY_ARTIFACT), `${geniusSummary}\n`);
|
|
1196
|
-
|
|
1444
|
+
const evidenceParagraphs = Array.from({ length: 72 }, (_unused, index) => {
|
|
1445
|
+
const claimId = mockClaimIds[index % mockClaimIds.length];
|
|
1446
|
+
const sourceA = `mock-source-${(index % RESEARCH_SOURCE_LAYERS.length) + 1}`;
|
|
1447
|
+
const sourceB = `mock-source-${(index % RESEARCH_SOURCE_LAYERS.length) + 8}`;
|
|
1448
|
+
const counter = index % 2 === 0 ? 'mock-counter-1' : 'mock-counter-2';
|
|
1449
|
+
return `Quality note ${index + 1}: claim ${claimId} is treated as a falsifiable research-pipeline assertion, not a fact about live web evidence. The mock run cites ${sourceA} and ${sourceB}, compares the claim with ${counter}, and preserves the implementation handoff as read-only evidence. This repeated fixture text deliberately keeps the selftest report above the quality-contract word floor while still naming the same source-ledger ids that the gate verifies.`;
|
|
1450
|
+
});
|
|
1451
|
+
const researchReportText = [
|
|
1452
|
+
'# SKS Research Report',
|
|
1453
|
+
'',
|
|
1454
|
+
`Prompt: ${plan.prompt}`,
|
|
1455
|
+
'',
|
|
1456
|
+
'## Question',
|
|
1457
|
+
'Can SKS Research Mode close a research mission only when it has enough sourced claims, counterevidence, falsification, implementation handoff material, and replication evidence to support a downstream execution route?',
|
|
1458
|
+
'',
|
|
1459
|
+
'## Methodology',
|
|
1460
|
+
'This mock run is a selftest fixture, so it does not claim live web retrieval. It exercises the same artifact contract that real research must satisfy: layered source ledger entries, source quality fields, claim-evidence matrix rows, falsification cases, a blueprint, an experiment plan, a replication pack, and a final reviewer decision.',
|
|
1461
|
+
'',
|
|
1462
|
+
'## Source Map',
|
|
1463
|
+
'The source ledger contains two mock sources per source layer plus two counterevidence records. The source ids include mock-source-1 through mock-source-14, and the counterevidence ids include mock-counter-1 and mock-counter-2. Each source row includes layer, kind, locator, publisher_or_author, accessed_at, reliability, credibility, stance, and claim_ids.',
|
|
1464
|
+
'',
|
|
1465
|
+
'## Key Claims',
|
|
1466
|
+
...ledger.entries.map((entry) => `- ${entry.id}: ${entry.claim} Sources: ${(entry.source_ids || []).join(', ')}. Counterevidence: ${(entry.counterevidence_ids || entry.falsifiers || []).join(', ')}.`),
|
|
1467
|
+
'',
|
|
1468
|
+
'## Evidence Matrix Summary',
|
|
1469
|
+
`The claim-evidence matrix records ${claimMatrix.key_claim_ids.length} key claims and ${claimMatrix.triangulated_claim_count} triangulated claims. Each critical or high-importance claim has at least one source id and one counterevidence id, and each hypothesis has a test_or_probe field for follow-up validation.`,
|
|
1470
|
+
'',
|
|
1471
|
+
'## Counterevidence',
|
|
1472
|
+
'The first counterexample, mock-counter-1, attacks overclaiming without decisive tests. The second counterexample, mock-counter-2, attacks missing replication and thin experiment plans. Both are intentionally simple but give the gate two independent counterevidence entries to verify.',
|
|
1473
|
+
'',
|
|
1474
|
+
'## Falsification',
|
|
1475
|
+
'The falsification ledger includes four cases. They attack summary-only output, missing independent confirmation, absent counterevidence, and absent replication. The cases survive only as gate-backed requirements, not as proof that the mock topic was researched on the live web.',
|
|
1476
|
+
'',
|
|
1477
|
+
'## Implementation Blueprint',
|
|
1478
|
+
'The implementation blueprint has eight sections: problem, decision, architecture, interfaces, data contracts, execution plan, verification plan, and risks and rollbacks. The key point is that Research does not change repository source. It creates a handoff for a later $Team route that can validate the research and then decide what to implement.',
|
|
1479
|
+
'',
|
|
1480
|
+
'## Experiment / Validation Plan',
|
|
1481
|
+
'The experiment plan contains five steps: compare a baseline and research output, score cited key claims, run the smallest implementation probe, compare falsification outcomes, and record replication commands. The replication pack lists the commands and expected artifacts needed to reproduce the gate.',
|
|
1482
|
+
'',
|
|
1483
|
+
'## Limitations',
|
|
1484
|
+
'This is mock evidence for harness verification. It proves the local artifact contract and gate behavior, not live research accuracy. A normal non-mock run must still collect real sources and must keep the gate blocked if source access is unavailable.',
|
|
1485
|
+
'',
|
|
1486
|
+
'## References',
|
|
1487
|
+
'- mock-source-1 through mock-source-14: layered mock sources generated by writeMockResearchResult.',
|
|
1488
|
+
'- mock-counter-1 and mock-counter-2: counterevidence fixtures generated by writeMockResearchResult.',
|
|
1489
|
+
'',
|
|
1490
|
+
...evidenceParagraphs
|
|
1491
|
+
].join('\n\n');
|
|
1492
|
+
await writeTextAtomic(path.join(dir, 'research-report.md'), `${researchReportText}\n`);
|
|
1197
1493
|
await writeTextAtomic(path.join(dir, paperArtifact), `# Research Paper: ${plan.prompt}\n\n## Abstract\nA source-cited research run should produce cross-layer, falsifiable novelty rather than only summarize known material.\n\n## Introduction\nThe mock topic is evaluated as a research workflow outcome with layered source coverage [mock-source-1].\n\n## Methodology\nFive xhigh agents produce Eureka ideas, debate, triangulate source layers, and falsify the strongest claim.\n\n## Findings\nThe surviving finding is that useful research needs cited novelty, source-layer coverage, cross-layer triangulation, and a cheap decisive probe.\n\n## Discussion\nThe debate favors gate-backed evidence over narrative confidence, and treats public discourse as signal rather than truth.\n\n## Limitations and Falsification\nThe claim fails without sources, counterevidence, triangulation checks, or testable predictions [mock-counter-1].\n\n## Conclusion and Next Experiment\nCompare this loop against a summary-only baseline and score testable insights.\n\n## References\n- [mock-source-1] Mock academic literature coverage.\n- [mock-source-2] Mock official government and leading-institution knowledge coverage.\n- [mock-source-3] Mock standards and primary documents coverage.\n- [mock-source-4] Mock current news and global reporting coverage.\n- [mock-source-5] Mock public discourse coverage.\n- [mock-source-6] Mock developer and practitioner knowledge coverage.\n- [mock-source-7] Mock counterevidence and fact-checking coverage.\n- [mock-counter-1] Mock overclaim counterexample.\n`);
|
|
1198
1494
|
await writeJsonAtomic(path.join(dir, 'research-gate.json'), {
|
|
1199
1495
|
...defaultResearchGate(),
|
|
@@ -1223,11 +1519,11 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1223
1519
|
debate_exchanges: debateLedger.exchanges.length,
|
|
1224
1520
|
consensus_iterations: debateLedger.consensus_iterations,
|
|
1225
1521
|
unanimous_consensus: true,
|
|
1226
|
-
counterevidence_sources:
|
|
1227
|
-
candidate_insights:
|
|
1522
|
+
counterevidence_sources: 2,
|
|
1523
|
+
candidate_insights: ledger.entries.length,
|
|
1228
1524
|
falsification_passes: 1,
|
|
1229
|
-
falsification_cases:
|
|
1230
|
-
testable_predictions:
|
|
1525
|
+
falsification_cases: falsificationLedger.cases.length,
|
|
1526
|
+
testable_predictions: experimentPlan.steps.length,
|
|
1231
1527
|
citation_coverage: true,
|
|
1232
1528
|
evidence: ['mock research report', `mock research paper: ${paperArtifact}`, 'mock genius opinion summary', 'mock research source skill', 'mock layered source ledger', 'mock agent ledger', 'mock debate ledger', 'mock novelty ledger', 'mock falsification ledger'],
|
|
1233
1529
|
notes: ['mock mode records the new contract but does not call a model or perform live web browsing']
|
|
@@ -1238,6 +1534,10 @@ export async function writeMockResearchResult(dir, plan) {
|
|
|
1238
1534
|
/**
 * Build the instruction prompt for one SKS Research Mode cycle.
 *
 * The prompt embeds the mission id, topic, cycle number, the policy sections,
 * the quality-contract text, the JSON-serialised research plan, the required
 * process / output-file lists, and the last 2500 characters of the previous
 * cycle output. The assembled text is then checked with
 * `validateResearchPromptContract`; when that check reports `ok === false`,
 * its blockers are appended under a "PROMPT CONTRACT BLOCKERS" heading instead
 * of throwing.
 *
 * @param {object} args
 * @param {string} args.id - Mission id; also used in the `.sneakoscope/missions/<id>/` paths.
 * @param {{prompt: string}} args.mission - Mission record; only `prompt` is read here.
 * @param {object} args.plan - Research plan; serialised verbatim into the prompt.
 *   `plan.research_council.agents` overrides the default agent council when present.
 * @param {number|string} args.cycle - Current cycle marker, interpolated as-is.
 * @param {string} [args.previous] - Previous cycle output; only its tail is included.
 * @returns {string} The prompt text, with contract blockers appended when validation fails.
 */
export function buildResearchPrompt({ id, mission, plan, cycle, previous }) {
    const paperArtifact = researchPaperArtifactForPlan(plan);
    const councilAgents = plan?.research_council?.agents || RESEARCH_AGENT_COUNCIL;
    const agentNames = councilAgents.map((agent) => researchAgentAgentName(agent)).join(', ');
    // NOTE: the whole prompt is one template literal on a single source line on purpose:
    // a physical line break inside the literal would inject a newline into the prompt.
    // Step 3 reads `min_triangulated_claims`; the earlier spelling `min_trianguled_claims`
    // does not match the "triangulated" naming used elsewhere in this module and would
    // interpolate the text "undefined". NOTE(review): confirm the key against
    // research-quality-contract.js.
    const promptText = `You are running SKS Research Mode.\nMISSION: ${id}\nTOPIC: ${mission.prompt}\nCYCLE: ${cycle}\nMODE: Genius Agent Council + frontier discovery loop. Use maximum reasoning depth available under the current Codex profile.\nLONG-RUN REAL-RESEARCH POLICY: Normal Research is allowed to take one or two hours when the question requires it. Do real source gathering and evidence comparison; do not shortcut into mock, fixture, or summary-only output. If live source access is unavailable, write the blocker and keep the gate unpassed.\nNO-CODE-MUTATION POLICY: Do not edit repository source, package metadata, docs, config, generated skills, or harness files. Write only route-local artifacts under .sneakoscope/missions/${id}/. If a needed implementation change is discovered, record it as a recommendation or blocker for a later execution route.\nNO-QUESTION LOCK: Do not ask the user. Resolve scope from research-plan.json and current project evidence.\nSAFETY: Destructive database operations and unsafe external actions are forbidden. Prefer read-only inspection, local files, and cited public sources.\nPERSONA POLICY: Use Einstein/Feynman/Turing/von Neumann-inspired agent lenses only as cognitive roles. Do not impersonate, roleplay private identity, or speak as the historical people.\nAGENT PERSONA POLICY: Every Research agent row must include agent_name, display_name, persona, persona_boundary, reasoning_effort: "xhigh", service_tier when available, falsifiers, cheap_probes, and challenge_or_response. Use these agent_name values exactly: ${agentNames}. Persona names are cognitive lenses, not impersonations.\nAGENT EFFORT POLICY: Every Research agent agent must use reasoning_effort=xhigh. Record effort: "xhigh" for every agent in agent-ledger.json. Any lower-effort agent output must keep research-gate.json unpassed.\nEUREKA POLICY: Every agent must literally write "Eureka!" and one non-obvious, source-linked idea before debate.\nCONSENSUS LOOP POLICY: This is not a fixed three-cycle run. Repeat source-gathering, agent Eureka ideas, debate, falsification, and synthesis pressure until every agent records final agreement with the surviving mechanism. If unanimous agreement is not reached, keep research-gate.json unpassed and continue until the explicit max-cycle safety cap pauses the run.\nDEBATE POLICY: The agents must debate vigorously but stay evidence-bound. Every agent must challenge or respond at least once, and debate-ledger.json must record exchanges, consensus_iterations, unanimous_consensus, and per-agent agreements before synthesis.\nPAPER POLICY: After the report and ledgers, write ${paperArtifact} as a concise manuscript with Abstract, Introduction, Methodology, Findings/Results, Discussion, Limitations/Falsification, Conclusion/Next Experiment, and References.\nSOURCE SKILL POLICY: Create or update ${RESEARCH_SOURCE_SKILL_ARTIFACT} as a route-local source collection skill before synthesis. It must name the selected source layers, query routes, quality fields, blockers, and cross-layer triangulation checks. Do not edit generated .agents/skills during the research run.\nWEB/SOURCE POLICY: Run layered source retrieval across every safely available layer before synthesis: latest public papers, official government or leading-institution data, standards or primary docs, current news including BBC/CNN/GDELT-style sources when relevant, public discourse including X/Twitter and Reddit when available, developer/practitioner sources such as Stack Overflow/Stack Exchange/GitHub, and counterevidence or fact-checking sources. Treat public discourse as signal, not truth. If a layer cannot be searched, record the blocker in source-ledger.json and do not pass the gate.\nQUALITY CONTRACT:\n${researchPromptContractText()}\nRESEARCH PLAN:\n${JSON.stringify(plan, null, 2)}\n\nOBJECTIVE: Produce genuinely useful candidate discoveries: non-obvious hypotheses, mechanisms, predictions, or experiments. Do not merely summarize. Mark uncertainty clearly.\n\nREQUIRED PROCESS:\n1. Source skill first: create ${RESEARCH_SOURCE_SKILL_ARTIFACT} with source layers, query templates, quality fields, blockers, and triangulation rules.\n2. Layered source search: create source-ledger.json with source_layers, queries, source ids, source quality notes, counterevidence sources, source claim_ids, triangulation.cross_layer_checks, citation coverage, and blockers.\n3. Claim matrix: create claim-evidence-matrix.json with at least ${DEFAULT_RESEARCH_QUALITY_CONTRACT.min_key_claims} key claims and ${DEFAULT_RESEARCH_QUALITY_CONTRACT.min_triangulated_claims} triangulated claims.\n4. Independent xhigh agents: create agent-ledger.json with agent_name/display_name/persona/persona_boundary, effort=xhigh, reasoning_effort=xhigh, a literal Eureka! idea, findings, source_ids, falsifiers, cheap_probes, and challenge_or_response for every agent lens.\n5. Debate to agreement: create debate-ledger.json with evidence-bound challenge/response exchanges involving every agent, consensus_iterations >= 1, unanimous_consensus=true only when all agents agree, and agent_agreements for every agent.\n6. Falsification: create falsification-ledger.json with at least ${DEFAULT_RESEARCH_QUALITY_CONTRACT.min_falsification_cases} attacks, missing evidence, source conflicts, and decisive next tests.\n7. Synthesis: write research-report.md with at least ${DEFAULT_RESEARCH_QUALITY_CONTRACT.min_report_words} words and novelty-ledger.json only after cited agent findings, Eureka ideas, unanimous debate agreement, cross-layer triangulation, and falsification are recorded.\n8. Handoff: write implementation-blueprint.json/.md, experiment-plan.json/.md, replication-pack.json, and research-final-review.json.\n9. Paper: write ${paperArtifact} as a paper-style manuscript with source-ledger references and limitations.\n\nREQUIRED OUTPUT FILES in .sneakoscope/missions/${id}/:\n- research-report.md: concise report with Question, Methodology, Source Map, Key Claims, Evidence Matrix Summary, Counterevidence, Falsification, Implementation Blueprint, Experiment / Validation Plan, Limitations, and References. Cite source-ledger ids for factual claims.\n- ${paperArtifact}: paper manuscript with Abstract, Introduction, Methodology, Findings/Results, Discussion, Limitations/Falsification, Conclusion/Next Experiment, and References using source-ledger ids.\n- ${RESEARCH_SOURCE_SKILL_ARTIFACT}: route-local source collection skill; it is evidence for the Skill Creator step and must not mutate generated .agents/skills.\n- source-ledger.json: layered web/source queries, source ids, source priority, source quality notes, claim_ids, counterevidence sources, citation coverage, triangulation checks, and blockers.\n- ${CLAIM_EVIDENCE_MATRIX_ARTIFACT}: key claims, source ids, counterevidence ids, triangulation, unsupported claims, and test probes.\n- ${SOURCE_QUALITY_REPORT_ARTIFACT}: source metadata and citation coverage audit.\n- agent-ledger.json: one entry per agent lens with agent_name, display_name, persona, persona_boundary, effort, reasoning_effort, service_tier, eureka, query_set, findings, source_ids, falsifiers, cheap_probes, and challenge_or_response.\n- debate-ledger.json: evidence-bound challenge/response exchanges, participants, changed minds, unresolved conflicts, consensus_iterations, unanimous_consensus, and agent_agreements for every agent.\n- novelty-ledger.json: entries with claim, novelty, confidence, falsifiability, evidence source ids, falsifiers, next_experiment.\n- falsification-ledger.json: attacks/counterexamples/source conflicts, result, and next_decisive_tests.\n- implementation-blueprint.json and implementation-blueprint.md: at least eight handoff sections.\n- experiment-plan.json and experiment-plan.md: at least five validation steps.\n- replication-pack.json: commands, inputs, expected artifacts, and reproduction notes.\n- research-final-review.json: approved=true only after all contract checks pass.\n- research-gate.json: set passed only when every required artifact and quality threshold passes.\n\nPrevious cycle tail:\n${String(previous || '').slice(-2500)}\n`;
    const promptValidation = validateResearchPromptContract(promptText);
    if (!promptValidation.ok) {
        // Surface contract violations inside the prompt itself rather than failing the cycle.
        return `${promptText}\n\nPROMPT CONTRACT BLOCKERS:\n${promptValidation.blockers.join('\n')}\n`;
    }
    return promptText;
}
|
|
1243
1543
|
//# sourceMappingURL=research.js.map
|