scai 0.1.178 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +162 -267
- package/dist/__tests__/CommitSuggesterCmd.test.js +112 -0
- package/dist/__tests__/CommitSuggesterCmd.test.js.map +1 -0
- package/dist/__tests__/EvalReportCmd.test.js +645 -0
- package/dist/__tests__/EvalReportCmd.test.js.map +1 -0
- package/dist/__tests__/ModelCmd.test.js +64 -0
- package/dist/__tests__/ModelCmd.test.js.map +1 -0
- package/dist/__tests__/agents/agentActions.test.js +345 -0
- package/dist/__tests__/agents/agentActions.test.js.map +1 -0
- package/dist/__tests__/agents/agentFeedback.test.js +118 -0
- package/dist/__tests__/agents/agentFeedback.test.js.map +1 -0
- package/dist/__tests__/agents/agentGeneralScope.test.js +74 -0
- package/dist/__tests__/agents/agentGeneralScope.test.js.map +1 -0
- package/dist/__tests__/agents/agentLoop.test.js +1723 -0
- package/dist/__tests__/agents/agentLoop.test.js.map +1 -0
- package/dist/__tests__/agents/agentPolicyState.test.js +948 -0
- package/dist/__tests__/agents/agentPolicyState.test.js.map +1 -0
- package/dist/__tests__/agents/agentReadEvidence.test.js +170 -0
- package/dist/__tests__/agents/agentReadEvidence.test.js.map +1 -0
- package/dist/__tests__/agents/agentReadPersistence.test.js +129 -0
- package/dist/__tests__/agents/agentReadPersistence.test.js.map +1 -0
- package/dist/__tests__/agents/agentResumeCheckpoint.test.js +90 -0
- package/dist/__tests__/agents/agentResumeCheckpoint.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchBatchPlanner.test.js +289 -0
- package/dist/__tests__/agents/agentSearchBatchPlanner.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchOwnership.test.js +166 -0
- package/dist/__tests__/agents/agentSearchOwnership.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchRanking.test.js +139 -0
- package/dist/__tests__/agents/agentSearchRanking.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchRouting.test.js +584 -0
- package/dist/__tests__/agents/agentSearchRouting.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchScoring.test.js +23 -0
- package/dist/__tests__/agents/agentSearchScoring.test.js.map +1 -0
- package/dist/__tests__/agents/agentSearchShared.test.js +78 -0
- package/dist/__tests__/agents/agentSearchShared.test.js.map +1 -0
- package/dist/__tests__/agents/agentStateMachine.test.js +58 -0
- package/dist/__tests__/agents/agentStateMachine.test.js.map +1 -0
- package/dist/__tests__/agents/agentTaskPersistence.test.js +156 -0
- package/dist/__tests__/agents/agentTaskPersistence.test.js.map +1 -0
- package/dist/__tests__/agents/agentTools.test.js +69 -0
- package/dist/__tests__/agents/agentTools.test.js.map +1 -0
- package/dist/__tests__/agents/agentTransform.test.js +779 -0
- package/dist/__tests__/agents/agentTransform.test.js.map +1 -0
- package/dist/__tests__/agents/analysisPlanGenStep.test.js +157 -0
- package/dist/__tests__/agents/analysisPlanGenStep.test.js.map +1 -0
- package/dist/__tests__/agents/answerOnlyCompletion.test.js +75 -0
- package/dist/__tests__/agents/answerOnlyCompletion.test.js.map +1 -0
- package/dist/__tests__/agents/decideNextAction.test.js +1662 -0
- package/dist/__tests__/agents/decideNextAction.test.js.map +1 -0
- package/dist/__tests__/agents/deriveFocusFromSearchStep.test.js +258 -0
- package/dist/__tests__/agents/deriveFocusFromSearchStep.test.js.map +1 -0
- package/dist/__tests__/agents/evidenceVerifierStep.test.js +113 -0
- package/dist/__tests__/agents/evidenceVerifierStep.test.js.map +1 -0
- package/dist/__tests__/agents/executionPolicyResolver.test.js +208 -0
- package/dist/__tests__/agents/executionPolicyResolver.test.js.map +1 -0
- package/dist/__tests__/agents/fileCheckStep.test.js +299 -0
- package/dist/__tests__/agents/fileCheckStep.test.js.map +1 -0
- package/dist/__tests__/agents/giveUpEvaluatorStep.test.js +35 -0
- package/dist/__tests__/agents/giveUpEvaluatorStep.test.js.map +1 -0
- package/dist/__tests__/agents/guardState.test.js +297 -0
- package/dist/__tests__/agents/guardState.test.js.map +1 -0
- package/dist/__tests__/agents/mainAgentHeuristics.test.js +72 -0
- package/dist/__tests__/agents/mainAgentHeuristics.test.js.map +1 -0
- package/dist/__tests__/agents/objectiveEvaluatorStep.test.js +60 -0
- package/dist/__tests__/agents/objectiveEvaluatorStep.test.js.map +1 -0
- package/dist/__tests__/agents/outerLoopRecoveryEvaluator.test.js +207 -0
- package/dist/__tests__/agents/outerLoopRecoveryEvaluator.test.js.map +1 -0
- package/dist/__tests__/agents/prompting.test.js +363 -0
- package/dist/__tests__/agents/prompting.test.js.map +1 -0
- package/dist/__tests__/agents/readinessGateStep.test.js +180 -0
- package/dist/__tests__/agents/readinessGateStep.test.js.map +1 -0
- package/dist/__tests__/agents/reasonNextStep.test.js +56 -0
- package/dist/__tests__/agents/reasonNextStep.test.js.map +1 -0
- package/dist/__tests__/agents/reasonNextTaskStep.test.js +284 -0
- package/dist/__tests__/agents/reasonNextTaskStep.test.js.map +1 -0
- package/dist/__tests__/agents/resolveAgentTargetClassification.test.js +170 -0
- package/dist/__tests__/agents/resolveAgentTargetClassification.test.js.map +1 -0
- package/dist/__tests__/agents/resolveProgressState.test.js +526 -0
- package/dist/__tests__/agents/resolveProgressState.test.js.map +1 -0
- package/dist/__tests__/agents/resumeCheckpoint.test.js +50 -0
- package/dist/__tests__/agents/resumeCheckpoint.test.js.map +1 -0
- package/dist/__tests__/agents/routingDecisionStep.test.js +134 -0
- package/dist/__tests__/agents/routingDecisionStep.test.js.map +1 -0
- package/dist/__tests__/agents/scopeClassificationStep.test.js +118 -0
- package/dist/__tests__/agents/scopeClassificationStep.test.js.map +1 -0
- package/dist/__tests__/agents/searchContext.test.js +97 -0
- package/dist/__tests__/agents/searchContext.test.js.map +1 -0
- package/dist/__tests__/agents/selectRelevantSourcesStep.test.js +73 -0
- package/dist/__tests__/agents/selectRelevantSourcesStep.test.js.map +1 -0
- package/dist/__tests__/agents/structuredOutput.test.js +45 -0
- package/dist/__tests__/agents/structuredOutput.test.js.map +1 -0
- package/dist/__tests__/agents/transformPlanGenStep.fallback.test.js +59 -0
- package/dist/__tests__/agents/transformPlanGenStep.fallback.test.js.map +1 -0
- package/dist/__tests__/agents/transformPlanGenStep.test.js +92 -0
- package/dist/__tests__/agents/transformPlanGenStep.test.js.map +1 -0
- package/dist/__tests__/agents/understandIntentStep.test.js +237 -0
- package/dist/__tests__/agents/understandIntentStep.test.js.map +1 -0
- package/dist/__tests__/agents/understandResumeContext.test.js +65 -0
- package/dist/__tests__/agents/understandResumeContext.test.js.map +1 -0
- package/dist/__tests__/agents/understandScope.test.js +227 -0
- package/dist/__tests__/agents/understandScope.test.js.map +1 -0
- package/dist/__tests__/agents/validateChangesStep.test.js +52 -0
- package/dist/__tests__/agents/validateChangesStep.test.js.map +1 -0
- package/dist/__tests__/askCommandTaskBinding.test.js +176 -0
- package/dist/__tests__/askCommandTaskBinding.test.js.map +1 -0
- package/dist/__tests__/commandVisibility.test.js +25 -0
- package/dist/__tests__/commandVisibility.test.js.map +1 -0
- package/dist/__tests__/config.devOutput.test.js +82 -0
- package/dist/__tests__/config.devOutput.test.js.map +1 -0
- package/dist/__tests__/currentContext.test.js +43 -0
- package/dist/__tests__/currentContext.test.js.map +1 -0
- package/dist/__tests__/daemonWorker.test.js +51 -0
- package/dist/__tests__/daemonWorker.test.js.map +1 -0
- package/dist/__tests__/dialogState.test.js +113 -0
- package/dist/__tests__/dialogState.test.js.map +1 -0
- package/dist/__tests__/evalCommands.test.js +506 -0
- package/dist/__tests__/evalCommands.test.js.map +1 -0
- package/dist/__tests__/evalCommandsSummary.test.js +68 -0
- package/dist/__tests__/evalCommandsSummary.test.js.map +1 -0
- package/dist/__tests__/example.test.js +1 -0
- package/dist/__tests__/example.test.js.map +1 -0
- package/dist/__tests__/factory.commitCommand.test.js +45 -0
- package/dist/__tests__/factory.commitCommand.test.js.map +1 -0
- package/dist/__tests__/factory.devOutputCommand.test.js +122 -0
- package/dist/__tests__/factory.devOutputCommand.test.js.map +1 -0
- package/dist/__tests__/factory.evalCommands.test.js +38 -0
- package/dist/__tests__/factory.evalCommands.test.js.map +1 -0
- package/dist/__tests__/factory.planCommand.test.js +35 -0
- package/dist/__tests__/factory.planCommand.test.js.map +1 -0
- package/dist/__tests__/factory.setupCommand.test.js +34 -0
- package/dist/__tests__/factory.setupCommand.test.js.map +1 -0
- package/dist/__tests__/factory.statusCommand.test.js +54 -0
- package/dist/__tests__/factory.statusCommand.test.js.map +1 -0
- package/dist/__tests__/fileRules/queryTokenRules.test.js +35 -0
- package/dist/__tests__/fileRules/queryTokenRules.test.js.map +1 -0
- package/dist/__tests__/fileRules/searchPathClassification.test.js +57 -0
- package/dist/__tests__/fileRules/searchPathClassification.test.js.map +1 -0
- package/dist/__tests__/generate.ollamaRecovery.test.js +344 -0
- package/dist/__tests__/generate.ollamaRecovery.test.js.map +1 -0
- package/dist/__tests__/index.modelStartup.test.js +24 -0
- package/dist/__tests__/index.modelStartup.test.js.map +1 -0
- package/dist/__tests__/indexCmd.test.js +85 -0
- package/dist/__tests__/indexCmd.test.js.map +1 -0
- package/dist/__tests__/indexSlashCommand.test.js +50 -0
- package/dist/__tests__/indexSlashCommand.test.js.map +1 -0
- package/dist/__tests__/ollamaService.test.js +103 -0
- package/dist/__tests__/ollamaService.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/codeTransformModule.small-file.test.js +68 -0
- package/dist/__tests__/pipeline/modules/codeTransformModule.small-file.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/commitSuggesterModule.test.js +68 -0
- package/dist/__tests__/pipeline/modules/commitSuggesterModule.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/fileSearchModule.test.js +284 -0
- package/dist/__tests__/pipeline/modules/fileSearchModule.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/finalAnswerModule.test.js +1139 -0
- package/dist/__tests__/pipeline/modules/finalAnswerModule.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/readFileModule.test.js +146 -0
- package/dist/__tests__/pipeline/modules/readFileModule.test.js.map +1 -0
- package/dist/__tests__/pipeline/modules/semanticAnalysisModule.test.js +192 -0
- package/dist/__tests__/pipeline/modules/semanticAnalysisModule.test.js.map +1 -0
- package/dist/__tests__/repoIdentity.test.js +31 -0
- package/dist/__tests__/repoIdentity.test.js.map +1 -0
- package/dist/__tests__/resumeContext.test.js +87 -0
- package/dist/__tests__/resumeContext.test.js.map +1 -0
- package/dist/__tests__/resumeState.test.js +239 -0
- package/dist/__tests__/resumeState.test.js.map +1 -0
- package/dist/__tests__/search/SearchOrchestrator.test.js +836 -0
- package/dist/__tests__/search/SearchOrchestrator.test.js.map +1 -0
- package/dist/__tests__/shellDialogUi.test.js +52 -0
- package/dist/__tests__/shellDialogUi.test.js.map +1 -0
- package/dist/__tests__/shellSession.test.js +102 -0
- package/dist/__tests__/shellSession.test.js.map +1 -0
- package/dist/__tests__/statusOwner.test.js +215 -0
- package/dist/__tests__/statusOwner.test.js.map +1 -0
- package/dist/__tests__/testing/contextEval.test.js +244 -0
- package/dist/__tests__/testing/contextEval.test.js.map +1 -0
- package/dist/__tests__/testing/harnessArtifacts.test.js +124 -0
- package/dist/__tests__/testing/harnessArtifacts.test.js.map +1 -0
- package/dist/__tests__/testing/llmTraceSession.test.js +67 -0
- package/dist/__tests__/testing/llmTraceSession.test.js.map +1 -0
- package/dist/__tests__/testing/registerDevCliCommands.test.js +35 -0
- package/dist/__tests__/testing/registerDevCliCommands.test.js.map +1 -0
- package/dist/__tests__/testing/runDiagnosis.test.js +159 -0
- package/dist/__tests__/testing/runDiagnosis.test.js.map +1 -0
- package/dist/__tests__/testing/runtimeLogReader.test.js +66 -0
- package/dist/__tests__/testing/runtimeLogReader.test.js.map +1 -0
- package/dist/__tests__/testing/testCommands.test.js +53 -0
- package/dist/__tests__/testing/testCommands.test.js.map +1 -0
- package/dist/__tests__/utils/compileSearchQuery.test.js +38 -0
- package/dist/__tests__/utils/compileSearchQuery.test.js.map +1 -0
- package/dist/__tests__/utils/consolePresentation.test.js +105 -0
- package/dist/__tests__/utils/consolePresentation.test.js.map +1 -0
- package/dist/__tests__/utils/extractFileReferences.test.js +41 -0
- package/dist/__tests__/utils/extractFileReferences.test.js.map +1 -0
- package/dist/__tests__/utils/log.test.js +34 -0
- package/dist/__tests__/utils/log.test.js.map +1 -0
- package/dist/__tests__/utils/runtimeLogger.test.js +200 -0
- package/dist/__tests__/utils/runtimeLogger.test.js.map +1 -0
- package/dist/__tests__/utils/spinner.test.js +31 -0
- package/dist/__tests__/utils/spinner.test.js.map +1 -0
- package/dist/__tests__/utils/verifyFocusPreference.test.js +41 -0
- package/dist/__tests__/utils/verifyFocusPreference.test.js.map +1 -0
- package/dist/agent/actions/index.js +301 -0
- package/dist/agent/actions/index.js.map +1 -0
- package/dist/agent/actions/normalize.js +360 -0
- package/dist/agent/actions/normalize.js.map +1 -0
- package/dist/agent/actions/schemas.js +129 -0
- package/dist/agent/actions/schemas.js.map +1 -0
- package/dist/agent/evidence/index.js +320 -0
- package/dist/agent/evidence/index.js.map +1 -0
- package/dist/agent/feedback/index.js +187 -0
- package/dist/agent/feedback/index.js.map +1 -0
- package/dist/agent/finalization/index.js +35 -0
- package/dist/agent/finalization/index.js.map +1 -0
- package/dist/agent/index.js +126 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/agent/logging/index.js +350 -0
- package/dist/agent/logging/index.js.map +1 -0
- package/dist/agent/persistence/boot.js +58 -0
- package/dist/agent/persistence/boot.js.map +1 -0
- package/dist/agent/persistence/currentTask.js +36 -0
- package/dist/agent/persistence/currentTask.js.map +1 -0
- package/dist/agent/persistence/hydrate.js +42 -0
- package/dist/agent/persistence/hydrate.js.map +1 -0
- package/dist/agent/persistence/index.js +15 -0
- package/dist/agent/persistence/index.js.map +1 -0
- package/dist/agent/persistence/snapshots.js +97 -0
- package/dist/agent/persistence/snapshots.js.map +1 -0
- package/dist/agent/persistence/steps.js +95 -0
- package/dist/agent/persistence/steps.js.map +1 -0
- package/dist/agent/persistence/tasks.js +195 -0
- package/dist/agent/persistence/tasks.js.map +1 -0
- package/dist/agent/persistence/turns.js +92 -0
- package/dist/agent/persistence/turns.js.map +1 -0
- package/dist/agent/policy/ambiguityResolution.js +226 -0
- package/dist/agent/policy/ambiguityResolution.js.map +1 -0
- package/dist/agent/policy/contracts.js +2 -0
- package/dist/agent/policy/contracts.js.map +1 -0
- package/dist/agent/policy/coveragePolicy.js +309 -0
- package/dist/agent/policy/coveragePolicy.js.map +1 -0
- package/dist/agent/policy/endDecisionPolicy.js +31 -0
- package/dist/agent/policy/endDecisionPolicy.js.map +1 -0
- package/dist/agent/policy/index.js +344 -0
- package/dist/agent/policy/index.js.map +1 -0
- package/dist/agent/policy/loopReview.js +778 -0
- package/dist/agent/policy/loopReview.js.map +1 -0
- package/dist/agent/policy/readinessPolicy.js +108 -0
- package/dist/agent/policy/readinessPolicy.js.map +1 -0
- package/dist/agent/policy/resolutionPipeline.js +356 -0
- package/dist/agent/policy/resolutionPipeline.js.map +1 -0
- package/dist/agent/policy/targetClassification.js +33 -0
- package/dist/agent/policy/targetClassification.js.map +1 -0
- package/dist/agent/prompting/actionChoice.js +90 -0
- package/dist/agent/prompting/actionChoice.js.map +1 -0
- package/dist/agent/prompting/finalAnswer.js +38 -0
- package/dist/agent/prompting/finalAnswer.js.map +1 -0
- package/dist/agent/prompting/index.js +14 -0
- package/dist/agent/prompting/index.js.map +1 -0
- package/dist/agent/prompting/plan.js +59 -0
- package/dist/agent/prompting/plan.js.map +1 -0
- package/dist/agent/prompting/transform.js +175 -0
- package/dist/agent/prompting/transform.js.map +1 -0
- package/dist/agent/prompting/understand.js +70 -0
- package/dist/agent/prompting/understand.js.map +1 -0
- package/dist/agent/read/freshness.js +29 -0
- package/dist/agent/read/freshness.js.map +1 -0
- package/dist/agent/read/fullReadPrompt.js +43 -0
- package/dist/agent/read/fullReadPrompt.js.map +1 -0
- package/dist/agent/read/index.js +140 -0
- package/dist/agent/read/index.js.map +1 -0
- package/dist/agent/read/persistence.js +88 -0
- package/dist/agent/read/persistence.js.map +1 -0
- package/dist/agent/read/summarizeReadEvidence.js +733 -0
- package/dist/agent/read/summarizeReadEvidence.js.map +1 -0
- package/dist/agent/read/targetResolution.js +126 -0
- package/dist/agent/read/targetResolution.js.map +1 -0
- package/dist/agent/resume/checkpoint.js +41 -0
- package/dist/agent/resume/checkpoint.js.map +1 -0
- package/dist/agent/runtime/lifecycle.js +67 -0
- package/dist/agent/runtime/lifecycle.js.map +1 -0
- package/dist/agent/runtime/progress.js +178 -0
- package/dist/agent/runtime/progress.js.map +1 -0
- package/dist/agent/runtime/runAgentLoop.js +402 -0
- package/dist/agent/runtime/runAgentLoop.js.map +1 -0
- package/dist/agent/runtime/runAgentPlanOnly.js +127 -0
- package/dist/agent/runtime/runAgentPlanOnly.js.map +1 -0
- package/dist/agent/runtime/understand.js +336 -0
- package/dist/agent/runtime/understand.js.map +1 -0
- package/dist/agent/search/batchPlanner.js +274 -0
- package/dist/agent/search/batchPlanner.js.map +1 -0
- package/dist/agent/search/candidateRetentionPolicy.js +184 -0
- package/dist/agent/search/candidateRetentionPolicy.js.map +1 -0
- package/dist/agent/search/directory.js +51 -0
- package/dist/agent/search/directory.js.map +1 -0
- package/dist/agent/search/exactTarget.js +151 -0
- package/dist/agent/search/exactTarget.js.map +1 -0
- package/dist/agent/search/fragment.js +110 -0
- package/dist/agent/search/fragment.js.map +1 -0
- package/dist/agent/search/index.js +166 -0
- package/dist/agent/search/index.js.map +1 -0
- package/dist/agent/search/laneClassifier.js +119 -0
- package/dist/agent/search/laneClassifier.js.map +1 -0
- package/dist/agent/search/limits.js +10 -0
- package/dist/agent/search/limits.js.map +1 -0
- package/dist/agent/search/ranking.js +22 -0
- package/dist/agent/search/ranking.js.map +1 -0
- package/dist/agent/search/regex.js +83 -0
- package/dist/agent/search/regex.js.map +1 -0
- package/dist/agent/search/routePolicy.js +11 -0
- package/dist/agent/search/routePolicy.js.map +1 -0
- package/dist/agent/search/searchContext.js +128 -0
- package/dist/agent/search/searchContext.js.map +1 -0
- package/dist/agent/search/semantic.js +113 -0
- package/dist/agent/search/semantic.js.map +1 -0
- package/dist/agent/search/semanticIndexSearch.js +202 -0
- package/dist/agent/search/semanticIndexSearch.js.map +1 -0
- package/dist/agent/search/shared.js +283 -0
- package/dist/agent/search/shared.js.map +1 -0
- package/dist/agent/search/shell.js +202 -0
- package/dist/agent/search/shell.js.map +1 -0
- package/dist/agent/search/snippetEvidence.js +57 -0
- package/dist/agent/search/snippetEvidence.js.map +1 -0
- package/dist/agent/search/types.js +2 -0
- package/dist/agent/search/types.js.map +1 -0
- package/dist/agent/state/index.js +99 -0
- package/dist/agent/state/index.js.map +1 -0
- package/dist/agent/state/memory.js +56 -0
- package/dist/agent/state/memory.js.map +1 -0
- package/dist/agent/structuredOutput/index.js +28 -0
- package/dist/agent/structuredOutput/index.js.map +1 -0
- package/dist/agent/tools/index.js +199 -0
- package/dist/agent/tools/index.js.map +1 -0
- package/dist/agent/transform/index.js +519 -0
- package/dist/agent/transform/index.js.map +1 -0
- package/dist/agent/transform/syntax.js +49 -0
- package/dist/agent/transform/syntax.js.map +1 -0
- package/dist/agent/types.js +20 -0
- package/dist/agent/types.js.map +1 -0
- package/dist/agents/actionRegistry.js +114 -0
- package/dist/agents/actionRegistry.js.map +1 -0
- package/dist/agents/agent.js +5 -0
- package/dist/agents/agent.js.map +1 -0
- package/dist/agents/agentActions.js +5 -0
- package/dist/agents/agentActions.js.map +1 -0
- package/dist/agents/agentEvidence.js +5 -0
- package/dist/agents/agentEvidence.js.map +1 -0
- package/dist/agents/agentFeedback.js +5 -0
- package/dist/agents/agentFeedback.js.map +1 -0
- package/dist/agents/agentLogging.js +5 -0
- package/dist/agents/agentLogging.js.map +1 -0
- package/dist/agents/agentLoop.js +5 -0
- package/dist/agents/agentLoop.js.map +1 -0
- package/dist/agents/agentMemory.js +5 -0
- package/dist/agents/agentMemory.js.map +1 -0
- package/dist/agents/agentPlanMode.js +5 -0
- package/dist/agents/agentPlanMode.js.map +1 -0
- package/dist/agents/agentPolicyState.js +5 -0
- package/dist/agents/agentPolicyState.js.map +1 -0
- package/dist/agents/agentProgress.js +93 -0
- package/dist/agents/agentProgress.js.map +1 -0
- package/dist/agents/agentSchemas.js +5 -0
- package/dist/agents/agentSchemas.js.map +1 -0
- package/dist/agents/agentSearchScoring.js +5 -0
- package/dist/agents/agentSearchScoring.js.map +1 -0
- package/dist/agents/agentStateMachine.js +5 -0
- package/dist/agents/agentStateMachine.js.map +1 -0
- package/dist/agents/agentTools.js +5 -0
- package/dist/agents/agentTools.js.map +1 -0
- package/dist/agents/agentTypes.js +5 -0
- package/dist/agents/agentTypes.js.map +1 -0
- package/dist/agents/agentUnderstand.js +5 -0
- package/dist/agents/agentUnderstand.js.map +1 -0
- package/dist/agents/analysisPlanGenStep.js +194 -17
- package/dist/agents/analysisPlanGenStep.js.map +1 -0
- package/dist/agents/answerOnlyCompletion.js +32 -0
- package/dist/agents/answerOnlyCompletion.js.map +1 -0
- package/dist/agents/collaboratorStep.js +1 -0
- package/dist/agents/collaboratorStep.js.map +1 -0
- package/dist/agents/decideNextAction.js +444 -0
- package/dist/agents/decideNextAction.js.map +1 -0
- package/dist/agents/deriveFocusFromSearchStep.js +83 -0
- package/dist/agents/deriveFocusFromSearchStep.js.map +1 -0
- package/dist/agents/evidenceVerifierStep.js +104 -13
- package/dist/agents/evidenceVerifierStep.js.map +1 -0
- package/dist/agents/fileCheckStep.js +381 -12
- package/dist/agents/fileCheckStep.js.map +1 -0
- package/dist/agents/giveUpEvaluatorStep.js +63 -0
- package/dist/agents/giveUpEvaluatorStep.js.map +1 -0
- package/dist/agents/guardPolicy.js +20 -0
- package/dist/agents/guardPolicy.js.map +1 -0
- package/dist/agents/guards/executionPolicyResolver.js +165 -0
- package/dist/agents/guards/executionPolicyResolver.js.map +1 -0
- package/dist/agents/guards/guardState.js +195 -0
- package/dist/agents/guards/guardState.js.map +1 -0
- package/dist/agents/guards/resolveProgressState.js +403 -0
- package/dist/agents/guards/resolveProgressState.js.map +1 -0
- package/dist/agents/infoPlanGenStep.js +66 -8
- package/dist/agents/infoPlanGenStep.js.map +1 -0
- package/dist/agents/integrateFeedbackStep.js +1 -0
- package/dist/agents/integrateFeedbackStep.js.map +1 -0
- package/dist/agents/iterationFileSelector.js +8 -7
- package/dist/agents/iterationFileSelector.js.map +1 -0
- package/dist/agents/mainAgentActivityLog.js +85 -0
- package/dist/agents/mainAgentActivityLog.js.map +1 -0
- package/dist/agents/mainAgentHeuristics.js +173 -0
- package/dist/agents/mainAgentHeuristics.js.map +1 -0
- package/dist/agents/mainAgentVerify.js +159 -0
- package/dist/agents/mainAgentVerify.js.map +1 -0
- package/dist/agents/objectiveEvaluatorStep.js +103 -0
- package/dist/agents/objectiveEvaluatorStep.js.map +1 -0
- package/dist/agents/outerLoopRecoveryEvaluator.js +108 -0
- package/dist/agents/outerLoopRecoveryEvaluator.js.map +1 -0
- package/dist/agents/readinessGateStep.js +95 -9
- package/dist/agents/readinessGateStep.js.map +1 -0
- package/dist/agents/reasonNextStep.js +9 -8
- package/dist/agents/reasonNextStep.js.map +1 -0
- package/dist/agents/reasonNextTaskStep.js +267 -144
- package/dist/agents/reasonNextTaskStep.js.map +1 -0
- package/dist/agents/researchPlanGenStep.js +61 -25
- package/dist/agents/researchPlanGenStep.js.map +1 -0
- package/dist/agents/resolveAgentTargetClassification.js +5 -0
- package/dist/agents/resolveAgentTargetClassification.js.map +1 -0
- package/dist/agents/resolveExecutionModeStep.js +1 -0
- package/dist/agents/resolveExecutionModeStep.js.map +1 -0
- package/dist/agents/resolveExplicitTargetsStep.js +74 -0
- package/dist/agents/resolveExplicitTargetsStep.js.map +1 -0
- package/dist/agents/routingDecisionStep.js +58 -11
- package/dist/agents/routingDecisionStep.js.map +1 -0
- package/dist/agents/scopeClassificationStep.js +66 -3
- package/dist/agents/scopeClassificationStep.js.map +1 -0
- package/dist/agents/selectRelevantSourcesStep.js +13 -5
- package/dist/agents/selectRelevantSourcesStep.js.map +1 -0
- package/dist/agents/structuralPreloadStep.js +3 -4
- package/dist/agents/structuralPreloadStep.js.map +1 -0
- package/dist/agents/transformPlanGenStep.js +105 -18
- package/dist/agents/transformPlanGenStep.js.map +1 -0
- package/dist/agents/understandIntentStep.js +237 -17
- package/dist/agents/understandIntentStep.js.map +1 -0
- package/dist/agents/validateChangesStep.js +16 -2
- package/dist/agents/validateChangesStep.js.map +1 -0
- package/dist/agents/writeFileStep.js +1 -0
- package/dist/agents/writeFileStep.js.map +1 -0
- package/dist/commands/AskCmd.js +139 -44
- package/dist/commands/AskCmd.js.map +1 -0
- package/dist/commands/BackupCmd.js +1 -0
- package/dist/commands/BackupCmd.js.map +1 -0
- package/dist/commands/ChangeLogUpdateCmd.js +1 -0
- package/dist/commands/ChangeLogUpdateCmd.js.map +1 -0
- package/dist/commands/CommitSuggesterCmd.js +55 -13
- package/dist/commands/CommitSuggesterCmd.js.map +1 -0
- package/dist/commands/DaemonCmd.js +52 -14
- package/dist/commands/DaemonCmd.js.map +1 -0
- package/dist/commands/DeleteIndex.js +1 -0
- package/dist/commands/DeleteIndex.js.map +1 -0
- package/dist/commands/EvalReportCmd.js +374 -0
- package/dist/commands/EvalReportCmd.js.map +1 -0
- package/dist/commands/FindCmd.js +1 -0
- package/dist/commands/FindCmd.js.map +1 -0
- package/dist/commands/GitCmd.js +1 -0
- package/dist/commands/GitCmd.js.map +1 -0
- package/dist/commands/IndexCmd.js +11 -79
- package/dist/commands/IndexCmd.js.map +1 -0
- package/dist/commands/InspectCmd.js +1 -0
- package/dist/commands/InspectCmd.js.map +1 -0
- package/dist/commands/ModelCmd.js +24 -0
- package/dist/commands/ModelCmd.js.map +1 -0
- package/dist/commands/ReadlineSingleton.js +1 -0
- package/dist/commands/ReadlineSingleton.js.map +1 -0
- package/dist/commands/ResetDbCmd.js +18 -1
- package/dist/commands/ResetDbCmd.js.map +1 -0
- package/dist/commands/ReviewCmd.js +1 -0
- package/dist/commands/ReviewCmd.js.map +1 -0
- package/dist/commands/StatusCmd.js +22 -0
- package/dist/commands/StatusCmd.js.map +1 -0
- package/dist/commands/StopDaemonCmd.js +1 -0
- package/dist/commands/StopDaemonCmd.js.map +1 -0
- package/dist/commands/SummaryCmd.js +1 -0
- package/dist/commands/SummaryCmd.js.map +1 -0
- package/dist/commands/SwitchCmd.js +9 -15
- package/dist/commands/SwitchCmd.js.map +1 -0
- package/dist/commands/TasksCmd.js +142 -57
- package/dist/commands/TasksCmd.js.map +1 -0
- package/dist/commands/TestCmd.js +66 -0
- package/dist/commands/TestCmd.js.map +1 -0
- package/dist/commands/WorkflowCmd.js +1 -0
- package/dist/commands/WorkflowCmd.js.map +1 -0
- package/dist/commands/commandVisibility.js +27 -0
- package/dist/commands/commandVisibility.js.map +1 -0
- package/dist/commands/evalCommands.js +1337 -0
- package/dist/commands/evalCommands.js.map +1 -0
- package/dist/commands/factory.js +206 -38
- package/dist/commands/factory.js.map +1 -0
- package/dist/config.js +62 -11
- package/dist/config.js.map +1 -0
- package/dist/constants.js +21 -3
- package/dist/constants.js.map +1 -0
- package/dist/context.js +33 -32
- package/dist/context.js.map +1 -0
- package/dist/daemon/daemonQueues.js +1 -20
- package/dist/daemon/daemonQueues.js.map +1 -0
- package/dist/daemon/daemonWorker.js +26 -37
- package/dist/daemon/daemonWorker.js.map +1 -0
- package/dist/daemon/generateSummaries.js +1 -0
- package/dist/daemon/generateSummaries.js.map +1 -0
- package/dist/daemon/runFolderCapsuleBatch.js +1 -0
- package/dist/daemon/runFolderCapsuleBatch.js.map +1 -0
- package/dist/daemon/runIndexingBatch.js +1 -0
- package/dist/daemon/runIndexingBatch.js.map +1 -0
- package/dist/daemon/runKgBatch.js +9 -1
- package/dist/daemon/runKgBatch.js.map +1 -0
- package/dist/db/backup.js +1 -0
- package/dist/db/backup.js.map +1 -0
- package/dist/db/client.js +18 -3
- package/dist/db/client.js.map +1 -0
- package/dist/db/fileIndex.js +110 -152
- package/dist/db/fileIndex.js.map +1 -0
- package/dist/db/functionExtractors/extractFromJava.js +1 -0
- package/dist/db/functionExtractors/extractFromJava.js.map +1 -0
- package/dist/db/functionExtractors/extractFromJs.js +1 -0
- package/dist/db/functionExtractors/extractFromJs.js.map +1 -0
- package/dist/db/functionExtractors/extractFromTs.js +1 -0
- package/dist/db/functionExtractors/extractFromTs.js.map +1 -0
- package/dist/db/functionExtractors/extractFromXML.js +1 -0
- package/dist/db/functionExtractors/extractFromXML.js.map +1 -0
- package/dist/db/functionExtractors/index.js +1 -0
- package/dist/db/functionExtractors/index.js.map +1 -0
- package/dist/db/functionIndex.js +9 -0
- package/dist/db/functionIndex.js.map +1 -0
- package/dist/db/schema.js +314 -99
- package/dist/db/schema.js.map +1 -0
- package/dist/db/sqlTemplates.js +1 -0
- package/dist/db/sqlTemplates.js.map +1 -0
- package/dist/fileRules/builtins.js +1 -0
- package/dist/fileRules/builtins.js.map +1 -0
- package/dist/fileRules/classifyFile.js +1 -0
- package/dist/fileRules/classifyFile.js.map +1 -0
- package/dist/fileRules/codeAllowedExtensions.js +1 -0
- package/dist/fileRules/codeAllowedExtensions.js.map +1 -0
- package/dist/fileRules/detectFileType.js +1 -0
- package/dist/fileRules/detectFileType.js.map +1 -0
- package/dist/fileRules/fileClassifier.js +1 -0
- package/dist/fileRules/fileClassifier.js.map +1 -0
- package/dist/fileRules/fileExceptions.js +1 -0
- package/dist/fileRules/fileExceptions.js.map +1 -0
- package/dist/fileRules/ignoredExtensions.js +1 -0
- package/dist/fileRules/ignoredExtensions.js.map +1 -0
- package/dist/fileRules/ignoredPaths.js +48 -5
- package/dist/fileRules/ignoredPaths.js.map +1 -0
- package/dist/fileRules/queryTokenRules.js +176 -0
- package/dist/fileRules/queryTokenRules.js.map +1 -0
- package/dist/fileRules/searchPathClassification.js +58 -0
- package/dist/fileRules/searchPathClassification.js.map +1 -0
- package/dist/fileRules/shouldIgnoreFiles.js +1 -0
- package/dist/fileRules/shouldIgnoreFiles.js.map +1 -0
- package/dist/fileRules/stopWords.js +9 -0
- package/dist/fileRules/stopWords.js.map +1 -0
- package/dist/fileRules/wellKnownRepoFiles.js +1 -0
- package/dist/fileRules/wellKnownRepoFiles.js.map +1 -0
- package/dist/git/commitSummary.js +227 -0
- package/dist/git/commitSummary.js.map +1 -0
- package/dist/github/api.js +1 -0
- package/dist/github/api.js.map +1 -0
- package/dist/github/auth.js +1 -0
- package/dist/github/auth.js.map +1 -0
- package/dist/github/github.js +1 -0
- package/dist/github/github.js.map +1 -0
- package/dist/github/githubAuthCheck.js +1 -0
- package/dist/github/githubAuthCheck.js.map +1 -0
- package/dist/github/postComments.js +1 -0
- package/dist/github/postComments.js.map +1 -0
- package/dist/github/repo.js +15 -24
- package/dist/github/repo.js.map +1 -0
- package/dist/github/token.js +1 -0
- package/dist/github/token.js.map +1 -0
- package/dist/github/types.js +1 -0
- package/dist/github/types.js.map +1 -0
- package/dist/index.js +318 -37
- package/dist/index.js.map +1 -0
- package/dist/lib/generate.js +264 -20
- package/dist/lib/generate.js.map +1 -0
- package/dist/lib/generateFolderCapsules.js +1 -0
- package/dist/lib/generateFolderCapsules.js.map +1 -0
- package/dist/lib/ollamaModelPolicy.js +59 -0
- package/dist/lib/ollamaModelPolicy.js.map +1 -0
- package/dist/lib/spinner.js +29 -9
- package/dist/lib/spinner.js.map +1 -0
- package/dist/modelSetup.js +25 -78
- package/dist/modelSetup.js.map +1 -0
- package/dist/pipeline/modules/changeLogModule.js +10 -1
- package/dist/pipeline/modules/changeLogModule.js.map +1 -0
- package/dist/pipeline/modules/cleanupModule.js +1 -0
- package/dist/pipeline/modules/cleanupModule.js.map +1 -0
- package/dist/pipeline/modules/codeTransformModule.js +10 -16
- package/dist/pipeline/modules/codeTransformModule.js.map +1 -0
- package/dist/pipeline/modules/commentModule.js +12 -0
- package/dist/pipeline/modules/commentModule.js.map +1 -0
- package/dist/pipeline/modules/commitSuggesterModule.js +82 -12
- package/dist/pipeline/modules/commitSuggesterModule.js.map +1 -0
- package/dist/pipeline/modules/contextReviewModule.js +12 -1
- package/dist/pipeline/modules/contextReviewModule.js.map +1 -0
- package/dist/pipeline/modules/dialogAnswerModule.js +58 -0
- package/dist/pipeline/modules/dialogAnswerModule.js.map +1 -0
- package/dist/pipeline/modules/fileSearchModule.js +5 -143
- package/dist/pipeline/modules/fileSearchModule.js.map +1 -0
- package/dist/pipeline/modules/finalAnswerModule.js +1176 -151
- package/dist/pipeline/modules/finalAnswerModule.js.map +1 -0
- package/dist/pipeline/modules/kgModule.js +18 -1
- package/dist/pipeline/modules/kgModule.js.map +1 -0
- package/dist/pipeline/modules/planAnswerModule.js +99 -0
- package/dist/pipeline/modules/planAnswerModule.js.map +1 -0
- package/dist/pipeline/modules/readFileModule.js +300 -0
- package/dist/pipeline/modules/readFileModule.js.map +1 -0
- package/dist/pipeline/modules/reviewModule.js +10 -1
- package/dist/pipeline/modules/reviewModule.js.map +1 -0
- package/dist/pipeline/modules/searchDbModule.js +159 -0
- package/dist/pipeline/modules/searchDbModule.js.map +1 -0
- package/dist/pipeline/modules/searchListDirectoryModule.js +62 -0
- package/dist/pipeline/modules/searchListDirectoryModule.js.map +1 -0
- package/dist/pipeline/modules/searchModuleShared.js +71 -0
- package/dist/pipeline/modules/searchModuleShared.js.map +1 -0
- package/dist/pipeline/modules/searchRegexModule.js +59 -0
- package/dist/pipeline/modules/searchRegexModule.js.map +1 -0
- package/dist/pipeline/modules/semanticAnalysisModule.js +185 -28
- package/dist/pipeline/modules/semanticAnalysisModule.js.map +1 -0
- package/dist/pipeline/modules/summaryModule.js +11 -1
- package/dist/pipeline/modules/summaryModule.js.map +1 -0
- package/dist/pipeline/registry/moduleRegistry.js +9 -0
- package/dist/pipeline/registry/moduleRegistry.js.map +1 -0
- package/dist/pipeline/runModulePipeline.js +1 -0
- package/dist/pipeline/runModulePipeline.js.map +1 -0
- package/dist/scripts/dbScriptSupport.js +172 -0
- package/dist/scripts/dbScriptSupport.js.map +1 -0
- package/dist/scripts/dbcheck.js +173 -267
- package/dist/scripts/dbcheck.js.map +1 -0
- package/dist/scripts/dboverview.js +161 -0
- package/dist/scripts/dboverview.js.map +1 -0
- package/dist/scripts/migrateDb.js +1 -0
- package/dist/scripts/migrateDb.js.map +1 -0
- package/dist/search/SearchOrchestrator.js +928 -0
- package/dist/search/SearchOrchestrator.js.map +1 -0
- package/dist/search/sharedRankingPolicy.js +283 -0
- package/dist/search/sharedRankingPolicy.js.map +1 -0
- package/dist/setup/reindexOwner.js +97 -0
- package/dist/setup/reindexOwner.js.map +1 -0
- package/dist/setup/setupOwner.js +100 -0
- package/dist/setup/setupOwner.js.map +1 -0
- package/dist/shell/dialogUi.js +81 -0
- package/dist/shell/dialogUi.js.map +1 -0
- package/dist/shellSession.js +126 -0
- package/dist/shellSession.js.map +1 -0
- package/dist/status/statusOwner.js +239 -0
- package/dist/status/statusOwner.js.map +1 -0
- package/dist/testing/contextEval.js +514 -0
- package/dist/testing/contextEval.js.map +1 -0
- package/dist/testing/fixtures/transform/small-file.input.js +5 -0
- package/dist/testing/fixtures/transform/small-file.input.js.map +1 -0
- package/dist/testing/harnessArtifacts.js +112 -0
- package/dist/testing/harnessArtifacts.js.map +1 -0
- package/dist/testing/llmTraceSession.js +67 -0
- package/dist/testing/llmTraceSession.js.map +1 -0
- package/dist/testing/registerDevCliCommands.js +43 -0
- package/dist/testing/registerDevCliCommands.js.map +1 -0
- package/dist/testing/runDiagnosis.js +248 -0
- package/dist/testing/runDiagnosis.js.map +1 -0
- package/dist/testing/runtimeLogReader.js +144 -0
- package/dist/testing/runtimeLogReader.js.map +1 -0
- package/dist/testing/testCommands.js +35 -303
- package/dist/testing/testCommands.js.map +1 -0
- package/dist/testing/testRegistry.js +233 -0
- package/dist/testing/testRegistry.js.map +1 -0
- package/dist/types.js +1 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/buildContextualPrompt.js +26 -75
- package/dist/utils/buildContextualPrompt.js.map +1 -0
- package/dist/utils/changeLogPrompt.js +1 -0
- package/dist/utils/changeLogPrompt.js.map +1 -0
- package/dist/utils/checkModel.js +17 -92
- package/dist/utils/checkModel.js.map +1 -0
- package/dist/utils/commentMap.js +1 -0
- package/dist/utils/commentMap.js.map +1 -0
- package/dist/utils/compileSearchQuery.js +23 -9
- package/dist/utils/compileSearchQuery.js.map +1 -0
- package/dist/utils/consolePresentation.js +208 -0
- package/dist/utils/consolePresentation.js.map +1 -0
- package/dist/utils/contentUtils.js +17 -2
- package/dist/utils/contentUtils.js.map +1 -0
- package/dist/utils/debugContext.js +1 -0
- package/dist/utils/debugContext.js.map +1 -0
- package/dist/utils/dialogState.js +201 -0
- package/dist/utils/dialogState.js.map +1 -0
- package/dist/utils/editor.js +1 -0
- package/dist/utils/editor.js.map +1 -0
- package/dist/utils/executionEvidence.js +50 -0
- package/dist/utils/executionEvidence.js.map +1 -0
- package/dist/utils/extractFileReferences.js +140 -6
- package/dist/utils/extractFileReferences.js.map +1 -0
- package/dist/utils/fileEvidenceCache.js +50 -0
- package/dist/utils/fileEvidenceCache.js.map +1 -0
- package/dist/utils/fileTree.js +1 -0
- package/dist/utils/fileTree.js.map +1 -0
- package/dist/utils/loadRelevantFolderCapsules.js +35 -5
- package/dist/utils/loadRelevantFolderCapsules.js.map +1 -0
- package/dist/utils/log.js +10 -1
- package/dist/utils/log.js.map +1 -0
- package/dist/utils/normalizeData.js +1 -0
- package/dist/utils/normalizeData.js.map +1 -0
- package/dist/utils/ollamaModelStatus.js +28 -0
- package/dist/utils/ollamaModelStatus.js.map +1 -0
- package/dist/utils/ollamaService.js +294 -0
- package/dist/utils/ollamaService.js.map +1 -0
- package/dist/utils/outputFormatter.js +1 -0
- package/dist/utils/outputFormatter.js.map +1 -0
- package/dist/utils/parseTaggedContent.js +1 -0
- package/dist/utils/parseTaggedContent.js.map +1 -0
- package/dist/utils/planActions.js +27 -46
- package/dist/utils/planActions.js.map +1 -0
- package/dist/utils/promptBuilderHelper.js +1 -0
- package/dist/utils/promptBuilderHelper.js.map +1 -0
- package/dist/utils/promptLogHelper.js +29 -13
- package/dist/utils/promptLogHelper.js.map +1 -0
- package/dist/utils/queryAnchors.js +71 -0
- package/dist/utils/queryAnchors.js.map +1 -0
- package/dist/utils/repoIdentity.js +82 -0
- package/dist/utils/repoIdentity.js.map +1 -0
- package/dist/utils/repoKey.js +1 -0
- package/dist/utils/repoKey.js.map +1 -0
- package/dist/utils/resolveTargetsToFiles.js +1 -0
- package/dist/utils/resolveTargetsToFiles.js.map +1 -0
- package/dist/utils/resumeContext.js +219 -0
- package/dist/utils/resumeContext.js.map +1 -0
- package/dist/utils/resumeState.js +310 -0
- package/dist/utils/resumeState.js.map +1 -0
- package/dist/utils/rollingPlan.js +118 -0
- package/dist/utils/rollingPlan.js.map +1 -0
- package/dist/utils/runQueryWithDaemonControl.js +11 -3
- package/dist/utils/runQueryWithDaemonControl.js.map +1 -0
- package/dist/utils/runtimeLogger.js +252 -0
- package/dist/utils/runtimeLogger.js.map +1 -0
- package/dist/utils/sanitizeQuery.js +1 -0
- package/dist/utils/sanitizeQuery.js.map +1 -0
- package/dist/utils/sharedUtils.js +1 -0
- package/dist/utils/sharedUtils.js.map +1 -0
- package/dist/utils/sleep.js +1 -0
- package/dist/utils/sleep.js.map +1 -0
- package/dist/utils/splitCodeIntoChunk.js +1 -0
- package/dist/utils/splitCodeIntoChunk.js.map +1 -0
- package/dist/utils/time.js +66 -0
- package/dist/utils/time.js.map +1 -0
- package/dist/utils/verifyFocusPreference.js +107 -0
- package/dist/utils/verifyFocusPreference.js.map +1 -0
- package/dist/utils/vscode.js +1 -0
- package/dist/utils/vscode.js.map +1 -0
- package/dist/workflow/workflowResolver.js +1 -0
- package/dist/workflow/workflowResolver.js.map +1 -0
- package/dist/workflow/workflowRunner.js +1 -0
- package/dist/workflow/workflowRunner.js.map +1 -0
- package/package.json +3 -3
- package/dist/agents/MainAgent.js +0 -1886
- package/dist/agents/contextReviewStep.js +0 -101
- package/dist/agents/finalPlanGenStep.js +0 -107
- package/dist/agents/structuralAnalysisStep.js +0 -46
- package/dist/agents/validationAnalysisStep.js +0 -87
- package/dist/pipeline/modules/chunkManagerModule.js +0 -24
- package/dist/pipeline/modules/cleanGeneratedTestsModule.js +0 -33
- package/dist/pipeline/modules/fileReaderModule.js +0 -72
- package/dist/pipeline/modules/gatherInfoModule.js +0 -181
- package/dist/pipeline/modules/generateTestsModule.js +0 -68
- package/dist/pipeline/modules/preserveCodeModule.js +0 -195
- package/dist/pipeline/modules/refactorModule.js +0 -40
- package/dist/pipeline/modules/repairTestsModule.js +0 -48
- package/dist/pipeline/modules/runTestsModule.js +0 -37
|
@@ -0,0 +1,1337 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Defines local eval cases and shared CLI/shell eval command actions.
|
|
3
|
+
*
|
|
4
|
+
* Why this file exists:
|
|
5
|
+
* - local evals catch new-agent regressions without needing a full benchmark harness
|
|
6
|
+
* - checks should prefer user-visible output and stable shared runtime-log tokens
|
|
7
|
+
* - stale architecture names should not stay in the default regression suite after cleanup
|
|
8
|
+
* - `evalQueries` stay as smoke prompts, while `localEvalCases` stay the real regression suite
|
|
9
|
+
* - CLI and shell eval commands should call one shared behavior owner
|
|
10
|
+
* - slower transform coverage should live behind one explicit eval command
|
|
11
|
+
*
|
|
12
|
+
* Example eval shapes:
|
|
13
|
+
* - explicit target -> answer mentions the right file or symbol
|
|
14
|
+
* - search-heavy question -> shared logs still show the new runtime start and final answer
|
|
15
|
+
* - repo-wide analysis -> answer stays grounded without relying on legacy trace phrases
|
|
16
|
+
*/
|
|
17
|
+
import fs from "fs";
|
|
18
|
+
import path from "path";
|
|
19
|
+
import { Script } from "node:vm";
|
|
20
|
+
import chalk from "chalk";
|
|
21
|
+
import { RUN_LOG_PATH } from "../constants.js";
|
|
22
|
+
import { buildEvalReportMarkdown, getLatestTaskId, printEvalReport, runEvalReportCommand } from "./EvalReportCmd.js";
|
|
23
|
+
import { createHarnessArtifactDir, writeHarnessArtifacts } from "../testing/harnessArtifacts.js";
|
|
24
|
+
import { startLlmTraceSession, stopLlmTraceSession } from "../testing/llmTraceSession.js";
|
|
25
|
+
import { extractFinalAnswerTextFromRuntimeLog, normalizeRuntimeLogText } from "../testing/runtimeLogReader.js";
|
|
26
|
+
import { resolveCanonicalRepoIdentity } from "../utils/repoIdentity.js";
|
|
27
|
+
// Name of the environment variable "SCAI_SUPPRESS_FINAL_CONTEXT".
// NOTE(review): the code that reads or sets it is outside this section — confirm its exact effect there.
const SUPPRESS_FINAL_CONTEXT_ENV = "SCAI_SUPPRESS_FINAL_CONTEXT";
|
|
28
|
+
/**
 * Creates the starting outcome table for a batch of evals.
 *
 * Every id is mapped to its own fresh placeholder record marked
 * "incomplete" and applicable, carrying a single "eval did not finish"
 * failure message.
 *
 * @param {string[]} evalIds - Ids of the evals selected for the batch.
 * @returns {Map<string, {kind: string, applicable: boolean, failures: string[]}>}
 *   Map from eval id to its placeholder outcome record.
 */
function createInitialEvalOutcomes(evalIds) {
    // A new object (and a new failures array) per id, so records are never shared.
    const makePlaceholder = () => ({
        kind: "incomplete",
        applicable: true,
        failures: ["eval did not finish"],
    });
    const seededEntries = evalIds.map((id) => [id, makePlaceholder()]);
    return new Map(seededEntries);
}
|
|
35
|
+
/**
 * Stores the outcome record for one eval, replacing whatever record the
 * map currently holds for that id.
 *
 * @param {Map<string, object>} outcomesById - Outcome table to update in place.
 * @param {string} evalId - Id of the eval whose outcome is being recorded.
 * @param {object} record - Outcome record to store.
 * @returns {void}
 */
function recordEvalOutcome(outcomesById, evalId, record) {
    // Deliberately returns nothing; the map is mutated in place.
    void outcomesById.set(evalId, record);
}
|
|
38
|
+
/**
 * Aggregates per-eval outcome records into batch totals.
 *
 * Each record is counted once by its `kind`; any kind other than
 * "passed", "failed" or "skipped" (including "incomplete") is counted as
 * incomplete. Records with a truthy `applicable` flag are additionally
 * counted as applicable.
 *
 * @param {Map<string, {kind: string, applicable: boolean}>} outcomesById
 *   Outcome records keyed by eval id.
 * @returns {{total: number, applicable: number, completed: number,
 *   passed: number, failed: number, skipped: number, incomplete: number}}
 *   Batch totals, where `completed` is `passed + failed`.
 */
export function buildEvalBatchSummary(outcomesById) {
    const recognizedKinds = new Set(["passed", "failed", "skipped"]);
    const tally = { passed: 0, failed: 0, skipped: 0, incomplete: 0 };
    let applicableCount = 0;
    for (const record of outcomesById.values()) {
        if (record.applicable) {
            applicableCount += 1;
        }
        // Unknown kinds fall into the incomplete bucket.
        const bucket = recognizedKinds.has(record.kind) ? record.kind : "incomplete";
        tally[bucket] += 1;
    }
    return {
        total: outcomesById.size,
        applicable: applicableCount,
        completed: tally.passed + tally.failed,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        incomplete: tally.incomplete,
    };
}
|
|
74
|
+
/**
 * Verifies that a batch summary is internally consistent.
 *
 * The summary is rejected when any of these hold:
 * - `total` differs from `passed + failed + skipped + incomplete`
 * - `completed` differs from `passed + failed`
 * - `completed` is greater than `applicable`
 * - `applicable` is greater than `total`
 *
 * @param {{total: number, applicable: number, completed: number,
 *   passed: number, failed: number, skipped: number, incomplete: number}} summary
 *   Batch totals to validate.
 * @returns {void}
 * @throws {Error} When the totals do not add up; the message lists the
 *   compared values.
 */
export function assertEvalSummaryInvariant(summary) {
    const classifiedCompleted = summary.passed + summary.failed;
    const classifiedTotal = summary.passed + summary.failed + summary.skipped + summary.incomplete;
    const violations = [
        summary.total !== classifiedTotal,
        summary.completed !== classifiedCompleted,
        summary.completed > summary.applicable,
        summary.applicable > summary.total,
    ];
    if (!violations.some(Boolean)) {
        return;
    }
    const details = [
        "Eval summary totals do not add up.",
        `total=${summary.total} classified=${classifiedTotal}`,
        `completed=${summary.completed} classifiedCompleted=${classifiedCompleted}`,
        `applicable=${summary.applicable} total=${summary.total}`,
    ];
    throw new Error(details.join(" "));
}
|
|
89
|
+
/**
 * Renders the one-line "[EVAL SUMMARY]" text for a finished batch.
 *
 * The line starts with a newline and lists the summary counters as
 * comma-separated `key=value` pairs, followed by the duration in seconds.
 *
 * @param {{total: number, applicable: number, completed: number,
 *   passed: number, failed: number, skipped: number, incomplete: number}} summary
 *   Batch totals to print.
 * @param {number|string} durationSec - Batch duration, printed with an "s" suffix.
 * @returns {string} The formatted summary line.
 */
export function formatEvalSummaryLine(summary, durationSec) {
    const counters = [
        ["total", summary.total],
        ["applicable", summary.applicable],
        ["completed", summary.completed],
        ["passed", summary.passed],
        ["failed", summary.failed],
        ["skipped", summary.skipped],
        ["incomplete", summary.incomplete],
    ];
    const renderedCounters = counters
        .map(([label, value]) => `${label}=${value}`)
        .join(", ");
    return `\n[EVAL SUMMARY] ${renderedCounters}, duration=${durationSec}s`;
}
|
|
92
|
+
/**
 * Writes the batch summary to `batch_summary.json` inside the artifact
 * directory, creating the directory (and any missing parents) first.
 *
 * The file holds the selected eval ids, the batch duration, the summary
 * totals, and the per-eval outcomes as a plain object keyed by eval id,
 * pretty-printed with two-space indentation.
 *
 * @param {string} artifactDir - Directory that receives the artifact.
 * @param {{selectedEvalIds: string[], durationSec: number|string,
 *   summary: object, outcomesById: Map<string, object>}} input
 *   Batch data to persist.
 * @returns {string} Path of the written `batch_summary.json` file.
 */
export function writeEvalBatchSummaryArtifact(artifactDir, input) {
    fs.mkdirSync(artifactDir, { recursive: true });
    // A Map already iterates as [key, value] pairs, so it converts directly;
    // JSON cannot serialize a Map, hence the plain-object copy.
    const outcomes = Object.fromEntries(input.outcomesById);
    const outputPath = path.join(artifactDir, "batch_summary.json");
    const payload = {
        selectedEvalIds: input.selectedEvalIds,
        durationSec: input.durationSec,
        summary: input.summary,
        outcomes,
    };
    fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));
    return outputPath;
}
|
|
104
|
+
/**
 * Free-form prompts exported as `evalQueries`.
 *
 * The file header describes these as smoke prompts, distinct from the
 * checked regression cases in `localEvalCases`; the entries carry no
 * expectations of their own.
 */
export const evalQueries = [
    "Explain how agentMemory, agentLoop, agentActions, and agentStateMachine divide responsibilities.",
    "Explain how the simple agent chooses between plan, search-db, search-regex, search-list-directory, triage-file, read-file, and final-answer.",
    "Trace one simple-agent run from understand through final answer and show where state transitions happen.",
    "How do runtime.log, runtime.ndjson, and llm_calls differ, and where is each written?",
    "Trace how generate() timing reaches structured logs and the console presentation layer.",
    "How does scai config dev-output affect console timing for both runtimes?",
    "Where is console presentation centralized, and which normal runtime messages flow through it?",
    "How are local-time timestamps formatted across runtime.log, runtime.ndjson, debug logs, and llm trace files?",
    "How does /resume for the simple agent work, and when does it reuse in-memory state?",
    "Where are readable runtime-local ids like session-001 and task-001 assigned, and where do they appear?",
    "Which simple-agent files under cli/src/agents are missing targeted test coverage?",
    "Summarize CLI architecture from index.ts, commands/factory.ts, and runQueryWithDaemonControl.ts.",
    "How does this repo run tests and where are the test entry points configured?",
    "Are there flaky-test signals in __tests__ or test scripts, and where would retries be added?",
    "Where are SQLite queries defined in db/fileIndex.ts and db/client.ts?",
    "Map the full run lifecycle in MainAgent and point out where routing decisions are made.",
    "For repo-wide analysis questions, which method decides verify wave budget and why?",
    "Compare runSearch, runVerify, and runResearch responsibilities and identify overlap risks.",
    "List all gate checks (phase/scope/route/readiness/research) and the exact stop conditions.",
    "Explain why a repo-wide question might still end up analyzing only a few files.",
];
|
|
126
|
+
// Exported scenario list; it is empty here.
// NOTE(review): whether other modules still read or populate it is not visible in this section — confirm before removing.
export const localEvalScenarios = [];
|
|
127
|
+
/**
 * Builds one eval case for the "transform" suite from a compact description.
 *
 * Every "$TARGET" in the query template is replaced with the target path.
 * The case applies when the run log contains "action=codeTransform", lists
 * the target path for syntax checking, and expects the standard agent log
 * tokens plus the target path in the answer.
 *
 * @param {object} options
 * @param {string} options.id - Eval case id.
 * @param {string} options.targetPath - File the transform operates on.
 * @param {string} options.queryTemplate - Prompt text containing "$TARGET" placeholders.
 * @param {string[]} [options.fileMustContain] - Tokens checked as present in the target file.
 * @param {string[]} [options.fileMustNotContain] - Tokens checked as absent from the target file.
 * @param {Array<{path: string, anyOf: string[]}>} [options.fileMustContainAnyOf]
 *   Alternative-token groups, passed through unchanged.
 * @returns {object} The assembled eval case.
 */
function createTransformEvalCase(options) {
    const { targetPath } = options;
    // Wraps a token list in the per-file check shape; a missing or empty list disables the check.
    const forTargetFile = (tokens) => {
        if (!tokens?.length) {
            return undefined;
        }
        return [{ path: targetPath, tokens }];
    };
    const checks = {
        mustContain: [
            "[agent] [task:created]",
            "artifact final-answer",
        ],
        mustContainAnswer: [targetPath],
        fileMustContain: forTargetFile(options.fileMustContain),
        fileMustNotContain: forTargetFile(options.fileMustNotContain),
        fileMustContainAnyOf: options.fileMustContainAnyOf,
    };
    // split/join substitutes every placeholder literally, with no "$" replacement patterns involved.
    const query = options.queryTemplate.split("$TARGET").join(targetPath);
    return {
        id: options.id,
        group: "agent",
        suite: "transform",
        query,
        runOptions: { runtime: "agent" },
        when: { mustContainInRunLog: ["action=codeTransform"] },
        syntaxCheckPaths: [targetPath],
        checks,
    };
}
|
|
160
|
+
export const localEvalCases = [
|
|
161
|
+
{
|
|
162
|
+
id: "agent_read_and_answer_simple_target",
|
|
163
|
+
group: "agent",
|
|
164
|
+
query: "Explain what cli/src/agents/agentTypes.ts is for.",
|
|
165
|
+
runOptions: {
|
|
166
|
+
runtime: "agent",
|
|
167
|
+
},
|
|
168
|
+
checks: {
|
|
169
|
+
mustContain: [
|
|
170
|
+
"[agent] [task:created]",
|
|
171
|
+
"artifact final-answer",
|
|
172
|
+
],
|
|
173
|
+
mustContainAnswer: [
|
|
174
|
+
"agentTypes.ts",
|
|
175
|
+
],
|
|
176
|
+
},
|
|
177
|
+
when: {
|
|
178
|
+
mustContainInRunLog: [
|
|
179
|
+
"action=read-file",
|
|
180
|
+
],
|
|
181
|
+
mustNotContainInRunLog: [
|
|
182
|
+
"action=triage-file",
|
|
183
|
+
],
|
|
184
|
+
},
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
id: "agent_broad_triage_then_full_read",
|
|
188
|
+
group: "agent",
|
|
189
|
+
query: "Explain how the modular agent chooses between triage-file and read-file.",
|
|
190
|
+
runOptions: {
|
|
191
|
+
runtime: "agent",
|
|
192
|
+
},
|
|
193
|
+
when: {
|
|
194
|
+
mustContainInRunLog: [
|
|
195
|
+
"action=triage-file",
|
|
196
|
+
"action=read-file",
|
|
197
|
+
],
|
|
198
|
+
},
|
|
199
|
+
checks: {
|
|
200
|
+
mustContain: [
|
|
201
|
+
"[agent] [task:created]",
|
|
202
|
+
"artifact final-answer",
|
|
203
|
+
],
|
|
204
|
+
mustContainAnswer: [
|
|
205
|
+
"triage-file",
|
|
206
|
+
"read-file",
|
|
207
|
+
],
|
|
208
|
+
},
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
id: "agent_ranked_candidate_promotion",
|
|
212
|
+
group: "agent",
|
|
213
|
+
query: "Explain how agentMemory, agentLoop, agentActions, and agentStateMachine divide responsibilities.",
|
|
214
|
+
runOptions: {
|
|
215
|
+
runtime: "agent",
|
|
216
|
+
},
|
|
217
|
+
when: {
|
|
218
|
+
mustContainInRunLog: [
|
|
219
|
+
"action=read-file",
|
|
220
|
+
],
|
|
221
|
+
},
|
|
222
|
+
checks: {
|
|
223
|
+
mustContain: [
|
|
224
|
+
"[agent] [task:created]",
|
|
225
|
+
"artifact final-answer",
|
|
226
|
+
],
|
|
227
|
+
mustContainAnswer: [
|
|
228
|
+
"agentLoop",
|
|
229
|
+
"agentActions",
|
|
230
|
+
],
|
|
231
|
+
},
|
|
232
|
+
},
|
|
233
|
+
{
|
|
234
|
+
id: "agent_list_plan_sections",
|
|
235
|
+
group: "agent",
|
|
236
|
+
query: "List the main sections in plan.md.",
|
|
237
|
+
runOptions: {
|
|
238
|
+
runtime: "agent",
|
|
239
|
+
},
|
|
240
|
+
checks: {
|
|
241
|
+
mustContain: [
|
|
242
|
+
"[agent] [task:created]",
|
|
243
|
+
"artifact final-answer",
|
|
244
|
+
],
|
|
245
|
+
mustContainAnswer: [
|
|
246
|
+
"plan.md",
|
|
247
|
+
],
|
|
248
|
+
},
|
|
249
|
+
},
|
|
250
|
+
{
|
|
251
|
+
id: "agent_cli_architecture_summary",
|
|
252
|
+
group: "agent",
|
|
253
|
+
query: "Summarize CLI architecture from index.ts, commands/factory.ts, and runQueryWithDaemonControl.ts.",
|
|
254
|
+
runOptions: {
|
|
255
|
+
runtime: "agent",
|
|
256
|
+
},
|
|
257
|
+
checks: {
|
|
258
|
+
mustContain: [
|
|
259
|
+
"[agent] [task:created]",
|
|
260
|
+
"artifact final-answer",
|
|
261
|
+
],
|
|
262
|
+
mustContainAnswer: [
|
|
263
|
+
"index.ts",
|
|
264
|
+
"factory.ts",
|
|
265
|
+
"runQueryWithDaemonControl.ts",
|
|
266
|
+
],
|
|
267
|
+
},
|
|
268
|
+
},
|
|
269
|
+
{
|
|
270
|
+
id: "routing_repo_wide_question",
|
|
271
|
+
group: "agent",
|
|
272
|
+
query: "summarize this repo architecture and identify weak coupling points",
|
|
273
|
+
runOptions: {
|
|
274
|
+
runtime: "agent",
|
|
275
|
+
},
|
|
276
|
+
checks: {
|
|
277
|
+
mustContain: [
|
|
278
|
+
"[agent] [task:created]",
|
|
279
|
+
"artifact final-answer",
|
|
280
|
+
],
|
|
281
|
+
mustContainAnswer: [
|
|
282
|
+
"module",
|
|
283
|
+
"architecture",
|
|
284
|
+
],
|
|
285
|
+
},
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
id: "verify_pipeline_sqlite_query",
|
|
289
|
+
group: "agent",
|
|
290
|
+
query: "Where are SQLite queries defined in db/fileIndex.ts and db/client.ts?",
|
|
291
|
+
runOptions: {
|
|
292
|
+
runtime: "agent",
|
|
293
|
+
},
|
|
294
|
+
checks: {
|
|
295
|
+
mustContain: [
|
|
296
|
+
"[agent] [task:created]",
|
|
297
|
+
"artifact final-answer",
|
|
298
|
+
],
|
|
299
|
+
mustContainAnswer: [
|
|
300
|
+
"db/fileIndex.ts",
|
|
301
|
+
"db/client.ts",
|
|
302
|
+
"SQLite",
|
|
303
|
+
],
|
|
304
|
+
},
|
|
305
|
+
},
|
|
306
|
+
{
|
|
307
|
+
id: "search_folder_basename_exact_target",
|
|
308
|
+
group: "agent",
|
|
309
|
+
query: "check guards",
|
|
310
|
+
runOptions: {
|
|
311
|
+
runtime: "agent",
|
|
312
|
+
},
|
|
313
|
+
checks: {
|
|
314
|
+
mustContain: [
|
|
315
|
+
"[agent] [task:created]",
|
|
316
|
+
"artifact final-answer",
|
|
317
|
+
],
|
|
318
|
+
mustContainAnswer: [
|
|
319
|
+
"executionPolicyResolver.ts",
|
|
320
|
+
"guardState.ts",
|
|
321
|
+
"resolveProgressState.ts",
|
|
322
|
+
],
|
|
323
|
+
},
|
|
324
|
+
},
|
|
325
|
+
{
|
|
326
|
+
id: "search_folder_path_exact_target",
|
|
327
|
+
group: "agent",
|
|
328
|
+
query: "check agents/guards",
|
|
329
|
+
runOptions: {
|
|
330
|
+
runtime: "agent",
|
|
331
|
+
},
|
|
332
|
+
checks: {
|
|
333
|
+
mustContain: [
|
|
334
|
+
"[agent] [task:created]",
|
|
335
|
+
"artifact final-answer",
|
|
336
|
+
],
|
|
337
|
+
mustContainAnswer: [
|
|
338
|
+
"executionPolicyResolver.ts",
|
|
339
|
+
"guardState.ts",
|
|
340
|
+
"resolveProgressState.ts",
|
|
341
|
+
],
|
|
342
|
+
},
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
id: "search_folder_natural_language_exact_target",
|
|
346
|
+
group: "agent",
|
|
347
|
+
query: "please tell me what the three files in the guards folder do?",
|
|
348
|
+
runOptions: {
|
|
349
|
+
runtime: "agent",
|
|
350
|
+
},
|
|
351
|
+
checks: {
|
|
352
|
+
mustContain: [
|
|
353
|
+
"artifact final-answer",
|
|
354
|
+
],
|
|
355
|
+
mustContainAnswer: [
|
|
356
|
+
"executionPolicyResolver.ts",
|
|
357
|
+
"guardState.ts",
|
|
358
|
+
"resolveProgressState.ts",
|
|
359
|
+
],
|
|
360
|
+
},
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
id: "search_folder_absolute_path_exact_target",
|
|
364
|
+
group: "agent",
|
|
365
|
+
query: "/Users/rzs/dev/repos/scai/cli/src/agents/guards",
|
|
366
|
+
runOptions: {
|
|
367
|
+
runtime: "agent",
|
|
368
|
+
},
|
|
369
|
+
checks: {
|
|
370
|
+
mustContain: [
|
|
371
|
+
"[agent] [task:created]",
|
|
372
|
+
"artifact final-answer",
|
|
373
|
+
],
|
|
374
|
+
mustContainAnswer: [
|
|
375
|
+
"executionPolicyResolver.ts",
|
|
376
|
+
],
|
|
377
|
+
},
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
id: "search_file_without_extension_exact_target",
|
|
381
|
+
group: "agent",
|
|
382
|
+
query: "Explain EvalReportCmd",
|
|
383
|
+
runOptions: {
|
|
384
|
+
runtime: "agent",
|
|
385
|
+
},
|
|
386
|
+
checks: {
|
|
387
|
+
mustContain: [
|
|
388
|
+
"[agent] [task:created]",
|
|
389
|
+
"artifact final-answer",
|
|
390
|
+
],
|
|
391
|
+
mustContainAnswer: [
|
|
392
|
+
"EvalReportCmd",
|
|
393
|
+
],
|
|
394
|
+
},
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
id: "search_file_with_extension_exact_target",
|
|
398
|
+
group: "agent",
|
|
399
|
+
query: "Explain loopReview.ts",
|
|
400
|
+
runOptions: {
|
|
401
|
+
runtime: "agent",
|
|
402
|
+
},
|
|
403
|
+
checks: {
|
|
404
|
+
mustContain: [
|
|
405
|
+
"[agent] [task:created]",
|
|
406
|
+
"artifact final-answer",
|
|
407
|
+
],
|
|
408
|
+
mustContainAnswer: [
|
|
409
|
+
"loopReview.ts",
|
|
410
|
+
],
|
|
411
|
+
},
|
|
412
|
+
},
|
|
413
|
+
{
|
|
414
|
+
id: "search_filename_with_spaces_between_words",
|
|
415
|
+
group: "agent",
|
|
416
|
+
query: "Explain final answer module",
|
|
417
|
+
runOptions: {
|
|
418
|
+
runtime: "agent",
|
|
419
|
+
},
|
|
420
|
+
checks: {
|
|
421
|
+
mustContain: [
|
|
422
|
+
"[agent] [task:created]",
|
|
423
|
+
"artifact final-answer",
|
|
424
|
+
],
|
|
425
|
+
mustContainAnswer: [
|
|
426
|
+
"Final Answer Module",
|
|
427
|
+
"buildFinalAnswer",
|
|
428
|
+
],
|
|
429
|
+
},
|
|
430
|
+
},
|
|
431
|
+
{
|
|
432
|
+
id: "search_snippet_without_filename",
|
|
433
|
+
group: "agent",
|
|
434
|
+
query: "\"returns raw operator-pattern matches without semantic guessing\"",
|
|
435
|
+
runOptions: {
|
|
436
|
+
runtime: "agent",
|
|
437
|
+
},
|
|
438
|
+
checks: {
|
|
439
|
+
mustContain: [
|
|
440
|
+
"[agent] [task:created]",
|
|
441
|
+
"artifact final-answer",
|
|
442
|
+
],
|
|
443
|
+
mustContainAnswer: [
|
|
444
|
+
"SearchOrchestrator.test.ts",
|
|
445
|
+
],
|
|
446
|
+
},
|
|
447
|
+
},
|
|
448
|
+
{
|
|
449
|
+
id: "search_identical_snippet_multiple_files",
|
|
450
|
+
group: "agent",
|
|
451
|
+
query: "\"const currentQuestionId = currentStep.questionId ?? (orderedQuestions.length === 1 ? orderedQuestions[0].id : undefined);\"",
|
|
452
|
+
runOptions: {
|
|
453
|
+
runtime: "agent",
|
|
454
|
+
},
|
|
455
|
+
checks: {
|
|
456
|
+
mustContain: [
|
|
457
|
+
"[agent] [task:created]",
|
|
458
|
+
"artifact final-answer",
|
|
459
|
+
],
|
|
460
|
+
mustContainAnswer: [
|
|
461
|
+
"analysisPlanGenStep.ts",
|
|
462
|
+
"transformPlanGenStep.ts",
|
|
463
|
+
],
|
|
464
|
+
},
|
|
465
|
+
},
|
|
466
|
+
{
|
|
467
|
+
id: "search_operator_pattern_query",
|
|
468
|
+
group: "agent",
|
|
469
|
+
query: "runSearch|semanticSearchFiles|plannerSearchFiles",
|
|
470
|
+
runOptions: {
|
|
471
|
+
runtime: "agent",
|
|
472
|
+
},
|
|
473
|
+
checks: {
|
|
474
|
+
mustContain: [
|
|
475
|
+
"[agent] [task:created]",
|
|
476
|
+
"artifact final-answer",
|
|
477
|
+
],
|
|
478
|
+
mustContainAnswer: [
|
|
479
|
+
"runSearch",
|
|
480
|
+
],
|
|
481
|
+
},
|
|
482
|
+
},
|
|
483
|
+
{
|
|
484
|
+
id: "search_unresolved_folder_fallback",
|
|
485
|
+
group: "agent",
|
|
486
|
+
query: "check nonexistent-folder-name",
|
|
487
|
+
runOptions: {
|
|
488
|
+
runtime: "agent",
|
|
489
|
+
},
|
|
490
|
+
checks: {
|
|
491
|
+
mustContain: [
|
|
492
|
+
"[agent] [task:created]",
|
|
493
|
+
],
|
|
494
|
+
},
|
|
495
|
+
},
|
|
496
|
+
{
|
|
497
|
+
id: "analysis_plan_and_semantic_pass",
|
|
498
|
+
group: "agent",
|
|
499
|
+
query: "Where does semanticAnalysis merge prior state, and what fields should remain semantic-only?",
|
|
500
|
+
runOptions: {
|
|
501
|
+
runtime: "agent",
|
|
502
|
+
},
|
|
503
|
+
checks: {
|
|
504
|
+
mustContain: [
|
|
505
|
+
"[agent] [task:created]",
|
|
506
|
+
"artifact final-answer",
|
|
507
|
+
"semanticAnalysisModule.ts",
|
|
508
|
+
],
|
|
509
|
+
},
|
|
510
|
+
},
|
|
511
|
+
{
|
|
512
|
+
id: "final_answer_for_test_entrypoints",
|
|
513
|
+
group: "agent",
|
|
514
|
+
query: "How does this repo run tests and where are the test entry points configured?",
|
|
515
|
+
runOptions: {
|
|
516
|
+
runtime: "agent",
|
|
517
|
+
},
|
|
518
|
+
checks: {
|
|
519
|
+
mustContain: [
|
|
520
|
+
"[agent] [task:created]",
|
|
521
|
+
"artifact final-answer",
|
|
522
|
+
],
|
|
523
|
+
mustContainAnswer: [
|
|
524
|
+
"vitest",
|
|
525
|
+
"package.json",
|
|
526
|
+
"vitest.config.ts",
|
|
527
|
+
],
|
|
528
|
+
},
|
|
529
|
+
},
|
|
530
|
+
/**
|
|
531
|
+
* Transform evals for the spatialmap practice files.
|
|
532
|
+
*
|
|
533
|
+
* These stay in a separate slow suite so the default `/evals` run stays fast.
|
|
534
|
+
*/
|
|
535
|
+
createTransformEvalCase({
|
|
536
|
+
id: "transform_spatialmap_small_rename_run_bootstrap_flow",
|
|
537
|
+
targetPath: "playground/test/spatialmap.small.js",
|
|
538
|
+
queryTemplate: "In $TARGET, rename runBootstrapFlow to bootstrapPortalFlow and update all call sites. Keep behavior the same.",
|
|
539
|
+
fileMustContain: [
|
|
540
|
+
"function bootstrapPortalFlow(",
|
|
541
|
+
"bootstrapPortalFlow(context)",
|
|
542
|
+
],
|
|
543
|
+
fileMustNotContain: [
|
|
544
|
+
"runBootstrapFlow",
|
|
545
|
+
],
|
|
546
|
+
}),
|
|
547
|
+
createTransformEvalCase({
|
|
548
|
+
id: "transform_spatialmap_small_extract_profile_url_helper",
|
|
549
|
+
targetPath: "playground/test/spatialmap.small.extract.js",
|
|
550
|
+
queryTemplate: "In $TARGET, extract the repeated /rest/profile/... URL building into a helper named buildProfileToolUrl and use it in buildPortalHtmlUrl, buildLayerPanelUrl, and buildSearchPanelUrl. Keep behavior the same.",
|
|
551
|
+
fileMustContain: [
|
|
552
|
+
"function buildProfileToolUrl(",
|
|
553
|
+
],
|
|
554
|
+
fileMustContainAnyOf: [
|
|
555
|
+
{
|
|
556
|
+
path: "playground/test/spatialmap.small.extract.js",
|
|
557
|
+
anyOf: [
|
|
558
|
+
"return buildProfileToolUrl(profileName, \"html/client/portal\")",
|
|
559
|
+
"return buildProfileToolUrl(profileName) + \"/html/client/portal\"",
|
|
560
|
+
],
|
|
561
|
+
},
|
|
562
|
+
{
|
|
563
|
+
path: "playground/test/spatialmap.small.extract.js",
|
|
564
|
+
anyOf: [
|
|
565
|
+
"return buildProfileToolUrl(profileName, \"html/client/layer-panel\")",
|
|
566
|
+
"return buildProfileToolUrl(profileName) + \"/html/client/layer-panel\"",
|
|
567
|
+
],
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
path: "playground/test/spatialmap.small.extract.js",
|
|
571
|
+
anyOf: [
|
|
572
|
+
"return buildProfileToolUrl(profileName, \"html/client/search-panel\")",
|
|
573
|
+
"return buildProfileToolUrl(profileName) + \"/html/client/search-panel\"",
|
|
574
|
+
],
|
|
575
|
+
},
|
|
576
|
+
],
|
|
577
|
+
fileMustNotContain: [
|
|
578
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/portal\";",
|
|
579
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/layer-panel\";",
|
|
580
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/search-panel\";",
|
|
581
|
+
],
|
|
582
|
+
}),
|
|
583
|
+
createTransformEvalCase({
|
|
584
|
+
id: "transform_spatialmap_small_multi_rename_flow_helpers",
|
|
585
|
+
targetPath: "playground/test/spatialmap.small.multi.js",
|
|
586
|
+
queryTemplate: "In $TARGET, rename the beginPortalFlow, beginWidgetFlow, and beginReadyFlow helper family to startPortalFlow, startWidgetFlow, and startReadyFlow, and update all call sites. Keep behavior the same.",
|
|
587
|
+
fileMustContain: [
|
|
588
|
+
"function startPortalFlow(",
|
|
589
|
+
"function startWidgetFlow(",
|
|
590
|
+
"function startReadyFlow(",
|
|
591
|
+
],
|
|
592
|
+
fileMustNotContain: [
|
|
593
|
+
"beginPortalFlow",
|
|
594
|
+
"beginWidgetFlow",
|
|
595
|
+
"beginReadyFlow",
|
|
596
|
+
],
|
|
597
|
+
}),
|
|
598
|
+
createTransformEvalCase({
|
|
599
|
+
id: "transform_spatialmap_small_multi_extract_portal_request_helpers",
|
|
600
|
+
targetPath: "playground/test/spatialmap.small.multi.extract.js",
|
|
601
|
+
queryTemplate: "In $TARGET, extract the repeated /rest/profile/... portal request URL building into a helper named buildProfileToolRequestUrl and use it in buildPortalHtmlRequestUrl, buildLayerPanelRequestUrl, and buildSearchPanelRequestUrl. Keep behavior the same.",
|
|
602
|
+
fileMustContain: [
|
|
603
|
+
"function buildProfileToolRequestUrl(",
|
|
604
|
+
"return buildProfileToolRequestUrl(profileName, \"html/client/portal\")",
|
|
605
|
+
"return buildProfileToolRequestUrl(profileName, \"html/client/layer-panel\")",
|
|
606
|
+
"return buildProfileToolRequestUrl(profileName, \"html/client/search-panel\")",
|
|
607
|
+
],
|
|
608
|
+
fileMustNotContain: [
|
|
609
|
+
"function buildPortalHtmlRequestUrl(profileName) { return \"/rest/profile/\" + profileName + \"/tools/html/client/portal\"; }",
|
|
610
|
+
"function buildLayerPanelRequestUrl(profileName) { return \"/rest/profile/\" + profileName + \"/tools/html/client/layer-panel\"; }",
|
|
611
|
+
"function buildSearchPanelRequestUrl(profileName) { return \"/rest/profile/\" + profileName + \"/tools/html/client/search-panel\"; }",
|
|
612
|
+
],
|
|
613
|
+
}),
|
|
614
|
+
createTransformEvalCase({
|
|
615
|
+
id: "transform_spatialmap_medium_document_body_cleanup",
|
|
616
|
+
targetPath: "playground/test/spatialmap.medium.js",
|
|
617
|
+
queryTemplate: "In $TARGET, add a helper named getPageBody that returns document.body, then use it in prepareSurface, loadPortalMarkup, loadSidebarMarkup, loadSearchMarkup, syncSurfaceShell, syncSurfacePanels, syncSurfaceMessages, syncSurfaceLayout, and syncSurfaceTheme. Keep behavior the same.",
|
|
618
|
+
fileMustContain: [
|
|
619
|
+
"function getPageBody()",
|
|
620
|
+
"document.body",
|
|
621
|
+
],
|
|
622
|
+
fileMustNotContain: [
|
|
623
|
+
"document.querySelector(\"body\").insertAdjacentHTML(",
|
|
624
|
+
"const pageBody = document.querySelector(\"body\");",
|
|
625
|
+
],
|
|
626
|
+
}),
|
|
627
|
+
createTransformEvalCase({
|
|
628
|
+
id: "transform_spatialmap_medium_multi_dom_surface_cleanup",
|
|
629
|
+
targetPath: "playground/test/spatialmap.medium.multi.js",
|
|
630
|
+
queryTemplate: "In $TARGET, add a helper named getPageBody that returns document.body, then use it in the markup-loading and syncSurface helpers instead of repeating document.querySelector(\"body\"). Keep behavior the same.",
|
|
631
|
+
fileMustContain: [
|
|
632
|
+
"function getPageBody()",
|
|
633
|
+
"return document.body;",
|
|
634
|
+
],
|
|
635
|
+
fileMustNotContain: [
|
|
636
|
+
"document.querySelector(\"body\")",
|
|
637
|
+
"context.widget.body",
|
|
638
|
+
],
|
|
639
|
+
}),
|
|
640
|
+
createTransformEvalCase({
|
|
641
|
+
id: "transform_spatialmap_medium_extract_profile_tool_url",
|
|
642
|
+
targetPath: "playground/test/spatialmap.medium.extract.js",
|
|
643
|
+
queryTemplate: "In $TARGET, extract the repeated /rest/profile/... tool URL building into a helper named buildProfileToolUrl and use it in buildPortalHtmlUrl, buildSidebarHtmlUrl, buildSearchHtmlUrl, and buildPortalSourceUrl. Keep behavior the same.",
|
|
644
|
+
fileMustContain: [
|
|
645
|
+
"function buildProfileToolUrl(",
|
|
646
|
+
"return buildProfileToolUrl(profileName, \"html/client/portal\")",
|
|
647
|
+
"return buildProfileToolUrl(profileName, \"html/client/sidebar\")",
|
|
648
|
+
"return buildProfileToolUrl(profileName, \"html/client/search\")",
|
|
649
|
+
"return buildProfileToolUrl(profileName, \"src/client/portal/\" + sessionId)",
|
|
650
|
+
],
|
|
651
|
+
fileMustNotContain: [
|
|
652
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/portal\";",
|
|
653
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/sidebar\";",
|
|
654
|
+
"return \"/rest/profile/\" + profileName + \"/tools/html/client/search\";",
|
|
655
|
+
"return \"/rest/profile/\" + profileName + \"/tools/src/client/portal/\" + sessionId;",
|
|
656
|
+
],
|
|
657
|
+
}),
|
|
658
|
+
createTransformEvalCase({
|
|
659
|
+
id: "transform_spatialmap_large_cache_session_id",
|
|
660
|
+
targetPath: "playground/test/spatialmap.js",
|
|
661
|
+
queryTemplate: "In $TARGET, cache sessionStorage.getItem(\"spatialmapSessionId\") in a const named sessionId inside doLoadNow and reuse it in loadToolsConfig, loadToolRequires, and initSpatialmapWidget. Keep behavior the same.",
|
|
662
|
+
fileMustContain: [
|
|
663
|
+
"const sessionId = sessionStorage.getItem(\"spatialmapSessionId\");",
|
|
664
|
+
"sessionId: sessionId,",
|
|
665
|
+
],
|
|
666
|
+
fileMustNotContain: [
|
|
667
|
+
"sessionStorage.getItem(\"spatialmapSessionId\"),",
|
|
668
|
+
],
|
|
669
|
+
}),
|
|
670
|
+
];
|
|
671
|
+
/**
 * Runs the first entry of `evalQueries` as a single smoke eval.
 *
 * The query is echoed to the console before it is handed to `runQuery`.
 *
 * @param {Function} runQuery - Callback invoked with the query and its run options.
 * @returns {Promise<void>}
 */
export async function runEvalSmokeCommand(runQuery) {
    const smokeQuery = evalQueries[0];
    const runOptions = { continueCurrentTask: false, runtime: "agent" };
    console.log(chalk.bold(`[EVAL QUERY] ${smokeQuery}`));
    await runQuery(smokeQuery, runOptions);
}
|
|
676
|
+
/**
 * Runs one query chosen by `pickRandom` from `evalQueries`.
 *
 * The selected query is printed before it is handed to `runQuery`.
 *
 * @param {Function} runQuery - Callback invoked with the query and its run options.
 * @returns {Promise<void>}
 */
export async function runEvalRandomCommand(runQuery) {
    const selectedQuery = pickRandom(evalQueries);
    const runOptions = { continueCurrentTask: false, runtime: "agent" };
    console.log(`\n[eval-random] Selected query:\n-> ${selectedQuery}\n`);
    await runQuery(selectedQuery, runOptions);
}
|
|
681
|
+
/**
 * Runs the local evals filtered to group "agent" and suite "default".
 *
 * @param {Function} runQuery - Query runner forwarded to `runLocalEvals`.
 * @returns {Promise<void>}
 */
export async function runEvalBatchCommand(runQuery) {
    const filter = { group: "agent", suite: "default" };
    await runLocalEvals(runQuery, filter);
}
|
|
684
|
+
/**
 * Runs the local evals filtered to group "agent" and suite "transform".
 *
 * @param {Function} runQuery - Query runner forwarded to `runLocalEvals`.
 * @returns {Promise<void>}
 */
export async function runEvalTransformBatchCommand(runQuery) {
    const filter = { group: "agent", suite: "transform" };
    await runLocalEvals(runQuery, filter);
}
|
|
687
|
+
/**
 * Runs the local evals whose ids are listed in `evalIds`.
 *
 * When no ids are given, prints a usage line plus every id found in
 * `localEvalCases` and returns without running anything.
 *
 * @param {Function} runQuery - Query runner forwarded to `runLocalEvals`.
 * @param {string[]} evalIds - Eval ids to run.
 * @returns {Promise<void>}
 */
export async function runEvalByIdCommand(runQuery, evalIds) {
    if (evalIds.length > 0) {
        await runLocalEvals(runQuery, { includeIds: evalIds });
        return;
    }
    console.log(chalk.yellow("Usage: eval-run <id> [id...]"));
    const availableIds = localEvalCases.map((evalCase) => evalCase.id).join(", ");
    console.log(chalk.yellow(`Available ids: ${availableIds}`));
}
|
|
695
|
+
/**
 * Prints every entry of `localEvalCases`: its id, group (default "main"),
 * suite (default "default"), and the query of its resolved scenario.
 *
 * @returns {Promise<void>}
 */
export async function listEvalsCommand() {
    console.log(chalk.cyan("\nAvailable evals:"));
    for (const evalCase of localEvalCases) {
        // Resolve first so an unknown scenario id throws before anything is printed for this case.
        const { query } = resolveEvalScenario(evalCase);
        const lines = [
            chalk.bold(`- ${evalCase.id}`),
            ` group: ${evalCase.group ?? "main"}`,
            ` suite: ${evalCase.suite ?? "default"}`,
            ` query: ${query}`,
        ];
        for (const line of lines) {
            console.log(line);
        }
    }
    console.log("");
}
|
|
706
|
+
/**
 * Runs the "agent"/"default" local evals with per-eval reporting enabled,
 * then prints one more report for the latest task when its id is a number.
 *
 * @param {Function} runQuery - Query runner forwarded to `runLocalEvals`.
 * @returns {Promise<void>}
 */
export async function runEvalReportShellCommand(runQuery) {
    const batchOptions = { reportEachEval: true, group: "agent", suite: "default" };
    await runLocalEvals(runQuery, batchOptions);
    const latestTaskId = getLatestTaskId();
    if (typeof latestTaskId !== "number") {
        return;
    }
    const reportOptions = {
        title: "Eval Report | latest task",
        mermaid: true,
        evaluate: true,
    };
    printEvalReport(latestTaskId, reportOptions);
}
|
|
717
|
+
/**
 * Registers the eval sub-commands on `cmd`.
 *
 * Commands are added in this order: eval, eval-random, evals,
 * evals-transform, eval-run, eval-list, eval-report.
 *
 * @param {object} cmd - Command builder exposing chainable `command`, `description`, `option`, and `action`.
 * @param {Function} runQuery - Query runner handed to the eval command implementations.
 * @returns {void}
 */
export function registerEvalCommands(cmd, runQuery) {
    // Shared shape for commands that only need a spec, a description, and a handler.
    const register = (spec, description, handler) => {
        cmd
            .command(spec)
            .description(description)
            .action(handler);
    };
    register("eval", "Run the default smoke eval query", async () => {
        await runEvalSmokeCommand(runQuery);
    });
    register("eval-random", "Run one random smoke eval query", async () => {
        await runEvalRandomCommand(runQuery);
    });
    register("evals", "Run the default local eval suite", async () => {
        await runEvalBatchCommand(runQuery);
    });
    register("evals-transform", "Run the transform-only local eval suite", async () => {
        await runEvalTransformBatchCommand(runQuery);
    });
    register("eval-run <id> [ids...]", "Run one or more local evals by id", async (id, ids) => {
        const evalIds = [id, ...(ids ?? [])];
        await runEvalByIdCommand(runQuery, evalIds);
    });
    register("eval-list", "List local eval ids and queries", async () => {
        await listEvalsCommand();
    });
    // eval-report is the only command with option flags, so it is chained by hand.
    cmd
        .command("eval-report [taskId]")
        .description("Show task flow timeline, eval checks, and Mermaid graph (defaults to latest task)")
        .option("--no-mermaid", "Skip Mermaid graph output")
        .option("--no-evaluate", "Skip flow eval checks")
        .action((taskId, opts) => {
            const { mermaid, evaluate } = opts;
            runEvalReportCommand(taskId, { mermaid, evaluate });
        });
}
|
|
766
|
+
/**
 * Installs the eval handlers on the `customCommands` lookup object.
 *
 * The "eval-run" handler trims each argument and drops empty ones before
 * delegating to `runEvalByIdCommand`.
 *
 * @param {object} customCommands - Mutable map of command name to async handler.
 * @param {Function} runQuery - Query runner handed to the eval command implementations.
 * @returns {void}
 */
export function registerEvalShellCommands(customCommands, runQuery) {
    const handlers = {
        "eval": async () => runEvalSmokeCommand(runQuery),
        "eval-random": async () => runEvalRandomCommand(runQuery),
        "evals": async () => runEvalBatchCommand(runQuery),
        "evals-transform": async () => runEvalTransformBatchCommand(runQuery),
        "eval-run": async (...rawIds) => {
            const evalIds = rawIds
                .map((rawId) => rawId.trim())
                .filter((evalId) => evalId.length > 0);
            return runEvalByIdCommand(runQuery, evalIds);
        },
        "eval-list": async () => listEvalsCommand(),
        "eval-report": async () => runEvalReportShellCommand(runQuery),
    };
    for (const [name, handler] of Object.entries(handlers)) {
        customCommands[name] = handler;
    }
}
|
|
775
|
+
/**
 * Returns one element of `items` at an index drawn with `Math.random()`.
 *
 * @param {Array} items - Candidate values.
 * @returns {*} The chosen element; `undefined` when `items` is empty.
 */
function pickRandom(items) {
    const randomIndex = Math.floor(Math.random() * items.length);
    return items[randomIndex];
}
|
|
778
|
+
/**
 * Counts non-overlapping occurrences of `needle` in `haystack`, scanning left to right.
 *
 * @param {string} haystack - Text to search.
 * @param {string} needle - Substring to count; an empty needle counts as zero.
 * @returns {number} Number of occurrences.
 */
function countOccurrences(haystack, needle) {
    if (!needle) {
        return 0;
    }
    let total = 0;
    let cursor = haystack.indexOf(needle);
    while (cursor !== -1) {
        total += 1;
        // Jump past the whole match so overlapping hits are not double counted.
        cursor = haystack.indexOf(needle, cursor + needle.length);
    }
    return total;
}
|
|
783
|
+
/**
 * Joins a repo-relative path onto the repo root with `path.join`.
 *
 * @param {string} repoRootPath - Repository root directory.
 * @param {string} repoRelativePath - Path relative to that root.
 * @returns {string} The joined path.
 */
function resolveRepoRelativePath(repoRootPath, repoRelativePath) {
    const joinedPath = path.join(repoRootPath, repoRelativePath);
    return joinedPath;
}
|
|
786
|
+
/**
 * Captures the real mutated fixture files that a transform eval asserted on.
 *
 * Paths are gathered from each assertion's `syntaxCheckPaths` and from the
 * `path` of every `fileMustContain`, `fileMustNotContain`, and
 * `fileMustContainAnyOf` check. Each distinct path is read once; paths that
 * do not exist under `repoRootPath` are skipped.
 *
 * Example:
 * - syntax check path `playground/test/spatialmap.small.js`
 * - saved artifact snapshot uses that same committed path
 *
 * @param {Array<object>} assertions - Eval cases whose asserted files should be captured.
 * @param {string} repoRootPath - Repository root used to resolve the relative paths.
 * @returns {Array<{path: string, content: string}>} One entry per existing file, in first-seen order.
 */
function collectOutputFilesForAssertions(assertions, repoRootPath) {
    const filePaths = new Set();
    for (const test of assertions) {
        for (const filePath of test.syntaxCheckPaths ?? []) {
            filePaths.add(filePath);
        }
        // Any-of checks assert on files too; without them a case that only
        // uses fileMustContainAnyOf would be saved with no output snapshot.
        const pathCheckGroups = [
            test.checks.fileMustContain,
            test.checks.fileMustNotContain,
            test.checks.fileMustContainAnyOf,
        ];
        for (const checkGroup of pathCheckGroups) {
            for (const check of checkGroup ?? []) {
                filePaths.add(check.path);
            }
        }
    }
    const files = [];
    for (const filePath of filePaths) {
        const absolutePath = resolveRepoRelativePath(repoRootPath, filePath);
        if (!fs.existsSync(absolutePath)) {
            continue;
        }
        files.push({
            path: filePath,
            content: fs.readFileSync(absolutePath, "utf-8"),
        });
    }
    return files;
}
|
|
819
|
+
/**
 * Evaluates token checks against files on disk and describes every violation.
 *
 * A check whose file does not exist yields one "file missing" failure and
 * its tokens are not evaluated.
 *
 * @param {Array<{path: string, tokens: string[]}>|undefined} checks - File checks; nullish means none.
 * @param {string} repoRootPath - Repository root used to resolve each check path.
 * @param {string} mode - "must-contain" reports absent tokens; "must-not-contain" reports present ones.
 * @returns {string[]} Failure messages in check order, then token order.
 */
function collectFileCheckFailures(checks, repoRootPath, mode) {
    const requirePresence = mode === "must-contain";
    const forbidPresence = mode === "must-not-contain";
    const failures = [];
    for (const check of checks ?? []) {
        const relativePath = check.path;
        const absolutePath = resolveRepoRelativePath(repoRootPath, relativePath);
        if (fs.existsSync(absolutePath)) {
            const fileText = fs.readFileSync(absolutePath, "utf-8");
            for (const token of check.tokens) {
                const present = fileText.includes(token);
                if (requirePresence && !present) {
                    failures.push(`file missing token: ${relativePath} -> ${token}`);
                }
                if (forbidPresence && present) {
                    failures.push(`file contains forbidden token: ${relativePath} -> ${token}`);
                }
            }
        }
        else {
            failures.push(`file missing: ${relativePath}`);
        }
    }
    return failures;
}
|
|
840
|
+
/**
 * Verifies that each checked file contains at least one of its accepted tokens.
 *
 * @param {Array<{path: string, anyOf: string[]}>|undefined} checks - Any-of checks; nullish means none.
 * @param {string} repoRootPath - Repository root used to resolve each check path.
 * @returns {string[]} One message per missing file or per file matching none of its tokens.
 */
function collectFileAnyTokenFailures(checks, repoRootPath) {
    const failures = [];
    for (const check of checks ?? []) {
        const relativePath = check.path;
        const absolutePath = resolveRepoRelativePath(repoRootPath, relativePath);
        if (!fs.existsSync(absolutePath)) {
            failures.push(`file missing: ${relativePath}`);
            continue;
        }
        const fileText = fs.readFileSync(absolutePath, "utf-8");
        const matchesAccepted = check.anyOf.some((candidate) => fileText.includes(candidate));
        if (matchesAccepted) {
            continue;
        }
        const acceptedList = check.anyOf.join(" || ");
        failures.push(`file missing any accepted token: ${relativePath} -> ${acceptedList}`);
    }
    return failures;
}
|
|
855
|
+
/**
 * Compiles each listed file with `new Script(...)` and reports the ones that fail.
 *
 * The files are only compiled, never run.
 *
 * @param {string[]|undefined} pathsToCheck - Repo-relative files to compile; nullish means none.
 * @param {string} repoRootPath - Repository root used to resolve each path.
 * @returns {string[]} One message per missing file or compile error, in input order.
 */
function collectSyntaxFailures(pathsToCheck, repoRootPath) {
    // Returns the failure message for one file, or undefined when it compiles.
    const describeProblem = (filePath) => {
        const absolutePath = resolveRepoRelativePath(repoRootPath, filePath);
        if (!fs.existsSync(absolutePath)) {
            return `syntax check file missing: ${filePath}`;
        }
        const sourceText = fs.readFileSync(absolutePath, "utf-8");
        try {
            new Script(sourceText, { filename: filePath });
            return undefined;
        }
        catch (error) {
            const detail = error instanceof Error ? error.message : String(error);
            return `syntax check failed: ${filePath} -> ${detail}`;
        }
    };
    const failures = [];
    for (const filePath of pathsToCheck ?? []) {
        const problem = describeProblem(filePath);
        if (problem !== undefined) {
            failures.push(problem);
        }
    }
    return failures;
}
|
|
873
|
+
/**
 * Saves one small metadata file for eval scenario runs that feed several assertions.
 *
 * Writes `scenario.json` inside `artifactDir` holding the scenario id, its
 * query, and the ids of the assertions evaluated against it.
 *
 * @param {string} artifactDir - Directory that receives `scenario.json`.
 * @param {{id: string, query: string}} scenario - Scenario being run.
 * @param {Array<{id: string}>} assertions - Assertions evaluated against the scenario.
 * @returns {void}
 */
function writeEvalScenarioMetadata(artifactDir, scenario, assertions) {
    const metadata = {
        scenarioId: scenario.id,
        query: scenario.query,
        assertionIds: assertions.map(({ id }) => id),
    };
    const serialized = JSON.stringify(metadata, null, 2);
    fs.writeFileSync(`${artifactDir}/scenario.json`, serialized, "utf-8");
}
|
|
883
|
+
/**
 * Resolves the execution unit for one assertion.
 * Example:
 * - direct eval keeps its own query/setup/runOptions
 * - grouped mutating eval maps to one shared scenario and runs once
 *
 * @param {object} test - Eval case; a truthy `scenarioId` selects a shared scenario.
 * @returns {object} A scenario built from the case itself, or the matching `localEvalScenarios` entry.
 * @throws {Error} When `scenarioId` is set but matches no shared scenario.
 */
function resolveEvalScenario(test) {
    const { scenarioId } = test;
    if (scenarioId) {
        const match = localEvalScenarios.find((candidate) => candidate.id === scenarioId);
        if (match) {
            return match;
        }
        throw new Error(`Unknown eval scenario: ${scenarioId}`);
    }
    // Direct eval: the case describes its own run, with an empty query as fallback.
    return {
        id: test.id,
        query: test.query ?? "",
        setupSteps: test.setupSteps,
        runOptions: test.runOptions,
    };
}
|
|
904
|
+
/**
 * Buckets eval cases by the id of their resolved scenario so each scenario
 * appears once with all of its assertions.
 *
 * @param {Array<object>} selectedEvals - Eval cases to group.
 * @returns {Array<{scenario: object, assertions: object[], runLog: string, finalAnswerText: string}>}
 *   One run record per scenario, in first-seen order, with empty log and answer text.
 */
function groupEvalCasesByScenario(selectedEvals) {
    const runsByScenarioId = new Map();
    for (const evalCase of selectedEvals) {
        const scenario = resolveEvalScenario(evalCase);
        let scenarioRun = runsByScenarioId.get(scenario.id);
        if (!scenarioRun) {
            // The first case seen for a scenario supplies the scenario object kept on the record.
            scenarioRun = {
                scenario,
                assertions: [],
                runLog: "",
                finalAnswerText: "",
            };
            runsByScenarioId.set(scenario.id, scenarioRun);
        }
        scenarioRun.assertions.push(evalCase);
    }
    return [...runsByScenarioId.values()];
}
|
|
922
|
+
/**
 * Returns whatever `extractFinalAnswerTextFromRuntimeLog` extracts from the run log.
 *
 * @param {string} runLog - Captured run log text.
 * @returns {*} The delegate's result, unchanged.
 */
function extractFinalAnswerText(runLog) {
    const finalAnswerText = extractFinalAnswerTextFromRuntimeLog(runLog);
    return finalAnswerText;
}
|
|
925
|
+
/**
 * Keeps only the latest visible run block for the current query when one exists.
 * Example: mixed shared output + `Query: check guards` -> slice from that query block onward.
 *
 * @param {string} runLog - Raw run log text.
 * @param {string} query - Query whose latest `Query: ...` line is looked up.
 * @returns {string} The normalized log unchanged when the query line is absent;
 *   otherwise the trimmed text starting at the nearest preceding "== " (or at
 *   the query line when no "== " precedes it).
 */
function isolateRunLogForQuery(runLog, query) {
    const logText = normalizeRuntimeLogText(runLog);
    const queryAt = logText.lastIndexOf(`Query: ${query.trim()}`);
    if (queryAt < 0) {
        return logText;
    }
    // Search backwards from the query line for the closest "== " marker.
    const headerAt = logText.lastIndexOf("== ", queryAt);
    const sliceFrom = headerAt >= 0 ? headerAt : queryAt;
    return logText.slice(sliceFrom).trim();
}
|
|
939
|
+
/**
 * Detects whether the captured run log looks like the run that this eval just created.
 * Example: a log with `Query: Explain index.ts` or `[agent] [task:created]` is owned; stale shared output is not.
 *
 * @param {string} runLog - Captured run log text.
 * @param {string} query - Query the eval ran.
 * @returns {boolean} False for a blank log; otherwise true when the log holds
 *   the query line or a task-created marker.
 */
function hasOwnedEvalRunLog(runLog, query) {
    if (runLog.trim() === "") {
        return false;
    }
    const ownershipMarkers = [
        `Query: ${query.trim()}`,
        "[agent] [task:created]",
        "[main] [task:created]",
    ];
    return ownershipMarkers.some((marker) => runLog.includes(marker));
}
|
|
951
|
+
/**
 * Decides whether an eval's `when` preconditions hold for the given run log.
 *
 * Required tokens are checked first, then forbidden tokens; the first unmet
 * condition decides the reason.
 *
 * @param {object} test - Eval case; `when.mustContainInRunLog` and `when.mustNotContainInRunLog` are optional.
 * @param {string} runLog - Run log text, normalized before matching.
 * @returns {{ok: boolean, reason?: string}} `{ ok: true }` when applicable, otherwise `ok: false` with a reason.
 */
function isEvalApplicable(test, runLog) {
    const logText = normalizeRuntimeLogText(runLog);
    const requiredTokens = test.when?.mustContainInRunLog ?? [];
    const missingToken = requiredTokens.find((token) => !logText.includes(token));
    if (missingToken !== undefined) {
        return { ok: false, reason: `condition unmet: missing "${missingToken}"` };
    }
    const forbiddenTokens = test.when?.mustNotContainInRunLog ?? [];
    const presentToken = forbiddenTokens.find((token) => logText.includes(token));
    if (presentToken !== undefined) {
        return { ok: false, reason: `condition unmet: found forbidden "${presentToken}"` };
    }
    return { ok: true };
}
|
|
967
|
+
/**
 * Maps failure messages and run-log content to human-readable tuning hints.
 *
 * Matching is case-insensitive substring search. When nothing specific
 * matches but there are failures, a single generic hint is returned.
 *
 * @param {string[]} failures - Failure messages collected for one eval.
 * @param {string} runLog - Run log text for the same eval.
 * @returns {string[]} Distinct hints in rule order; empty when nothing applies.
 */
function deriveTuningHints(failures, runLog) {
    const failureText = failures.join(" ").toLowerCase();
    const runLogText = runLog.toLowerCase();
    // Each rule fires when its haystack contains any of its needles.
    const rules = [
        {
            haystack: failureText,
            needles: ["targetsymbols", "\"can\""],
            hint: "Symbol extraction is too permissive. Tune sentence-starter/stopword filtering in symbol heuristics.",
        },
        {
            haystack: failureText,
            needles: ["domain-anchors:", "missing token: domain-anchors:"],
            hint: "Domain anchor recall is low. Tune phrase expansions or generic-token exclusions in query anchor extraction.",
        },
        {
            haystack: failureText,
            needles: ["searchorchestrator", "resolvedtargetfiles"],
            hint: "Exact-target retrieval did not leave a visible search packet. Inspect orchestrator logging and downstream focus derivation.",
        },
        {
            haystack: failureText,
            needles: ["output | infoplangen", "output | filesearch"],
            hint: "Expansion did not trigger as expected. Tune readiness weak-alignment guard or regex escalation thresholds.",
        },
        {
            haystack: runLogText,
            needles: ["weakly aligned to domain anchors", "weak domain-anchor alignment"],
            hint: "Weak-alignment gate fired. Consider lowering anchor strictness or adding a missing phrase synonym.",
        },
    ];
    const hints = rules
        .filter(({ haystack, needles }) => needles.some((needle) => haystack.includes(needle)))
        .map(({ hint }) => hint);
    if (hints.length === 0 && failures.length > 0) {
        hints.push("Inspect failed tokens and tune the nearest retrieval gate (symbol, anchors, readiness, or expansion) with a minimal change.");
    }
    return hints;
}
|
|
991
|
+
/**
 * Builds the markdown eval report for a task and optionally prints it.
 *
 * Errors thrown while building or printing are logged as a warning and
 * turned into an `undefined` result.
 *
 * @param {number|undefined|null} taskId - Task to report on; anything but a number yields `undefined`.
 * @param {string} testId - Eval id used in the report title.
 * @param {string} artifactDir - Artifact directory passed through to the report builders.
 * @param {{reportEachEval?: boolean}} [options] - When `reportEachEval` is truthy the report is also printed.
 * @returns {string|undefined} The report markdown, or `undefined` when unavailable.
 */
function buildEvalArtifactReport(taskId, testId, artifactDir, options) {
    if (typeof taskId !== "number") {
        return undefined;
    }
    // A fresh options object per call, so the builder and the printer never share one.
    const makeReportOptions = () => ({
        title: `Eval Report | ${testId}`,
        mermaid: true,
        evaluate: true,
        artifactDir,
    });
    try {
        const reportText = buildEvalReportMarkdown(taskId, makeReportOptions()) ?? undefined;
        if (options?.reportEachEval) {
            printEvalReport(taskId, makeReportOptions());
        }
        return reportText;
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(chalk.yellow(` - report error: ${message}`));
        return undefined;
    }
}
|
|
1018
|
+
/**
 * Translates eval step options into the options object handed to `runQuery`.
 *
 * With `resumeLatestTask` set, the run resumes `latestTaskId` and never
 * continues the current task; otherwise the caller's own `resumeTaskId`
 * and `continueCurrentTask` (default false) are used.
 *
 * @param {object|undefined} options - Eval step or scenario run options.
 * @param {number|undefined} latestTaskId - Task id to resume when `resumeLatestTask` is set.
 * @returns {{resumeTaskId: *, bindAsCurrentTask: *, continueCurrentTask: boolean, runtime: *}}
 */
function resolveEvalQueryOptions(options, latestTaskId) {
    const { resumeLatestTask, resumeTaskId, continueCurrentTask, bindAsCurrentTask, runtime, } = options ?? {};
    if (!resumeLatestTask) {
        return {
            resumeTaskId,
            continueCurrentTask: continueCurrentTask ?? false,
            bindAsCurrentTask,
            runtime,
        };
    }
    return {
        resumeTaskId: latestTaskId,
        bindAsCurrentTask,
        continueCurrentTask: false,
        runtime,
    };
}
|
|
1034
|
+
/**
 * Classifies why a transform-suite eval probably failed.
 *
 * Cases outside the "transform" suite are never classified.
 *
 * @param {{id: string, suite?: string}} test - Eval case being classified.
 * @param {string[]} failures - Failure messages collected for the case.
 * @param {string|undefined} runtimeError - Runtime error message, when the run threw.
 * @returns {"runtime-failure"|"harness-strictness"|"partial-transform-miss"|undefined}
 *   `undefined` for non-transform cases and for transform cases without failures.
 */
function inferLikelyFailureKind(test, failures, runtimeError) {
    const suite = test.suite ?? "default";
    if (suite !== "transform") {
        return undefined;
    }
    const mentionsRuntimeProblem = (failure) => failure.includes("runtime error") || failure.includes("syntax check failed");
    if (runtimeError || failures.some(mentionsRuntimeProblem)) {
        return "runtime-failure";
    }
    const fallbackKind = failures.length > 0 ? "partial-transform-miss" : undefined;
    // This case is always reported as a real miss, never as harness strictness.
    if (test.id === "transform_spatialmap_small_multi_extract_portal_request_helpers") {
        return fallbackKind;
    }
    const strictnessProneIds = [
        "transform_spatialmap_small_extract_profile_url_helper",
        "transform_spatialmap_medium_extract_profile_tool_url",
    ];
    const isTokenShapeFailure = (failure) => failure.includes("file missing token:") || failure.includes("file contains forbidden token:");
    const onlyTokenShapeFailures = failures.length > 0 && failures.every(isTokenShapeFailure);
    if (onlyTokenShapeFailures && strictnessProneIds.includes(test.id)) {
        return "harness-strictness";
    }
    return fallbackKind;
}
|
|
1052
|
+
export async function runLocalEvals(runQuery, options) {
|
|
1053
|
+
const previousSuppress = process.env[SUPPRESS_FINAL_CONTEXT_ENV];
|
|
1054
|
+
process.env[SUPPRESS_FINAL_CONTEXT_ENV] = "1";
|
|
1055
|
+
const requestedIds = new Set(options?.includeIds ?? []);
|
|
1056
|
+
const selectedEvalsBase = requestedIds.size > 0
|
|
1057
|
+
? localEvalCases.filter(test => requestedIds.has(test.id))
|
|
1058
|
+
: localEvalCases;
|
|
1059
|
+
const selectedEvals = options?.group
|
|
1060
|
+
? selectedEvalsBase.filter((test) => (test.group ?? "main") === options.group)
|
|
1061
|
+
: selectedEvalsBase;
|
|
1062
|
+
const suiteFilteredEvals = options?.suite
|
|
1063
|
+
? selectedEvals.filter((test) => (test.suite ?? "default") === options.suite)
|
|
1064
|
+
: selectedEvals;
|
|
1065
|
+
if (requestedIds.size > 0 && suiteFilteredEvals.length === 0) {
|
|
1066
|
+
console.log(chalk.yellow("[EVAL] No matching eval ids."));
|
|
1067
|
+
console.log(chalk.yellow(`Available ids: ${localEvalCases.map(test => test.id).join(", ")}`));
|
|
1068
|
+
return;
|
|
1069
|
+
}
|
|
1070
|
+
if (!requestedIds.size && (options?.group || options?.suite) && suiteFilteredEvals.length === 0) {
|
|
1071
|
+
const groupLabel = options?.group ? `group "${options.group}"` : undefined;
|
|
1072
|
+
const suiteLabel = options?.suite ? `suite "${options.suite}"` : undefined;
|
|
1073
|
+
const filterSummary = [groupLabel, suiteLabel].filter(Boolean).join(" and ");
|
|
1074
|
+
console.log(chalk.yellow(`[EVAL] No matching evals for ${filterSummary}.`));
|
|
1075
|
+
return;
|
|
1076
|
+
}
|
|
1077
|
+
const startedAt = Date.now();
|
|
1078
|
+
const selectedScenarioRuns = groupEvalCasesByScenario(suiteFilteredEvals);
|
|
1079
|
+
const repoRootPath = resolveCanonicalRepoIdentity().repoRootPath;
|
|
1080
|
+
const outcomesById = createInitialEvalOutcomes(suiteFilteredEvals.map((test) => test.id));
|
|
1081
|
+
let batchRunError;
|
|
1082
|
+
try {
|
|
1083
|
+
for (const scenarioRun of selectedScenarioRuns) {
|
|
1084
|
+
const { scenario, assertions } = scenarioRun;
|
|
1085
|
+
scenarioRun.artifactDir = createHarnessArtifactDir("evals", scenario.id);
|
|
1086
|
+
writeEvalScenarioMetadata(scenarioRun.artifactDir, scenario, assertions);
|
|
1087
|
+
fs.mkdirSync(requireRunLogDir(), { recursive: true });
|
|
1088
|
+
fs.writeFileSync(RUN_LOG_PATH, "", { flag: "w" });
|
|
1089
|
+
let latestTaskId = getLatestTaskId() ?? undefined;
|
|
1090
|
+
startLlmTraceSession(scenarioRun.artifactDir);
|
|
1091
|
+
try {
|
|
1092
|
+
for (const step of scenario.setupSteps ?? []) {
|
|
1093
|
+
const beforeSetupTaskId = getLatestTaskId();
|
|
1094
|
+
await runQuery(step.query, resolveEvalQueryOptions(step.options, latestTaskId));
|
|
1095
|
+
const afterSetupTaskId = getLatestTaskId();
|
|
1096
|
+
latestTaskId =
|
|
1097
|
+
typeof afterSetupTaskId === "number" &&
|
|
1098
|
+
(beforeSetupTaskId == null || afterSetupTaskId >= beforeSetupTaskId)
|
|
1099
|
+
? afterSetupTaskId
|
|
1100
|
+
: latestTaskId;
|
|
1101
|
+
}
|
|
1102
|
+
const assertionIds = assertions.map(test => test.id).join(", ");
|
|
1103
|
+
console.log(chalk.cyan(`\n[EVAL] ${scenario.id}`));
|
|
1104
|
+
console.log(chalk.dim(` assertions: ${assertionIds}`));
|
|
1105
|
+
console.log(chalk.bold(`[EVAL QUERY] ${scenario.query}`));
|
|
1106
|
+
fs.appendFileSync(RUN_LOG_PATH, `\n[EVAL] ${scenario.id}\n[EVAL ASSERTIONS] ${assertionIds}\n[EVAL QUERY] ${scenario.query}\n`, "utf-8");
|
|
1107
|
+
const beforeTaskId = getLatestTaskId();
|
|
1108
|
+
try {
|
|
1109
|
+
await runQuery(scenario.query, resolveEvalQueryOptions(scenario.runOptions, latestTaskId));
|
|
1110
|
+
}
|
|
1111
|
+
catch (err) {
|
|
1112
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1113
|
+
scenarioRun.runtimeError = message;
|
|
1114
|
+
}
|
|
1115
|
+
const afterTaskId = getLatestTaskId();
|
|
1116
|
+
scenarioRun.createdTaskId =
|
|
1117
|
+
typeof afterTaskId === "number" &&
|
|
1118
|
+
(beforeTaskId == null || afterTaskId > beforeTaskId)
|
|
1119
|
+
? afterTaskId
|
|
1120
|
+
: undefined;
|
|
1121
|
+
const rawRunLog = fs.existsSync(RUN_LOG_PATH)
|
|
1122
|
+
? fs.readFileSync(RUN_LOG_PATH, "utf-8")
|
|
1123
|
+
: "";
|
|
1124
|
+
scenarioRun.runLog = isolateRunLogForQuery(rawRunLog, scenario.query);
|
|
1125
|
+
scenarioRun.finalAnswerText = extractFinalAnswerText(scenarioRun.runLog);
|
|
1126
|
+
scenarioRun.outputFiles = collectOutputFilesForAssertions(assertions, repoRootPath);
|
|
1127
|
+
if (!scenarioRun.createdTaskId && !hasOwnedEvalRunLog(scenarioRun.runLog, scenario.query)) {
|
|
1128
|
+
scenarioRun.harnessError = "eval harness could not isolate the run log or created task for this scenario";
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
finally {
|
|
1132
|
+
stopLlmTraceSession();
|
|
1133
|
+
}
|
|
1134
|
+
for (const test of assertions) {
|
|
1135
|
+
let reportText;
|
|
1136
|
+
const testQuery = test.query ?? scenario.query;
|
|
1137
|
+
const normalizedRunLog = normalizeRuntimeLogText(scenarioRun.runLog);
|
|
1138
|
+
if (scenarioRun.harnessError) {
|
|
1139
|
+
console.log(chalk.yellow(`[INCOMPLETE] ${test.id}`));
|
|
1140
|
+
console.log(chalk.yellow(` - ${scenarioRun.harnessError}`));
|
|
1141
|
+
const artifactDir = writeHarnessArtifacts({
|
|
1142
|
+
kind: "evals",
|
|
1143
|
+
caseId: test.id,
|
|
1144
|
+
query: testQuery,
|
|
1145
|
+
taskId: scenarioRun.createdTaskId ?? undefined,
|
|
1146
|
+
artifactDir: scenarioRun.artifactDir,
|
|
1147
|
+
runLog: scenarioRun.runLog,
|
|
1148
|
+
finalAnswerText: scenarioRun.finalAnswerText,
|
|
1149
|
+
result: {
|
|
1150
|
+
id: test.id,
|
|
1151
|
+
applicable: true,
|
|
1152
|
+
passed: false,
|
|
1153
|
+
outcome: "incomplete",
|
|
1154
|
+
failures: [scenarioRun.harnessError],
|
|
1155
|
+
},
|
|
1156
|
+
outputFiles: scenarioRun.outputFiles,
|
|
1157
|
+
});
|
|
1158
|
+
recordEvalOutcome(outcomesById, test.id, {
|
|
1159
|
+
kind: "incomplete",
|
|
1160
|
+
applicable: true,
|
|
1161
|
+
failures: [scenarioRun.harnessError],
|
|
1162
|
+
artifactDir,
|
|
1163
|
+
});
|
|
1164
|
+
console.log(chalk.dim(` artifacts: ${artifactDir}`));
|
|
1165
|
+
continue;
|
|
1166
|
+
}
|
|
1167
|
+
if (scenarioRun.runtimeError) {
|
|
1168
|
+
recordEvalOutcome(outcomesById, test.id, {
|
|
1169
|
+
kind: "failed",
|
|
1170
|
+
applicable: true,
|
|
1171
|
+
failures: [`runtime error: ${scenarioRun.runtimeError}`],
|
|
1172
|
+
});
|
|
1173
|
+
console.log(chalk.red(`[FAIL] ${test.id}`));
|
|
1174
|
+
console.log(chalk.red(` - runtime error: ${scenarioRun.runtimeError}`));
|
|
1175
|
+
const artifactDir = writeHarnessArtifacts({
|
|
1176
|
+
kind: "evals",
|
|
1177
|
+
caseId: test.id,
|
|
1178
|
+
query: testQuery,
|
|
1179
|
+
artifactDir: scenarioRun.artifactDir,
|
|
1180
|
+
runLog: scenarioRun.runLog,
|
|
1181
|
+
finalAnswerText: scenarioRun.finalAnswerText,
|
|
1182
|
+
result: {
|
|
1183
|
+
id: test.id,
|
|
1184
|
+
applicable: true,
|
|
1185
|
+
passed: false,
|
|
1186
|
+
failures: [`runtime error: ${scenarioRun.runtimeError}`],
|
|
1187
|
+
likelyFailureKind: inferLikelyFailureKind(test, [`runtime error: ${scenarioRun.runtimeError}`], scenarioRun.runtimeError),
|
|
1188
|
+
},
|
|
1189
|
+
outputFiles: scenarioRun.outputFiles,
|
|
1190
|
+
});
|
|
1191
|
+
recordEvalOutcome(outcomesById, test.id, {
|
|
1192
|
+
kind: "failed",
|
|
1193
|
+
applicable: true,
|
|
1194
|
+
failures: [`runtime error: ${scenarioRun.runtimeError}`],
|
|
1195
|
+
artifactDir,
|
|
1196
|
+
});
|
|
1197
|
+
continue;
|
|
1198
|
+
}
|
|
1199
|
+
const applicability = isEvalApplicable(test, scenarioRun.runLog);
|
|
1200
|
+
if (!applicability.ok) {
|
|
1201
|
+
reportText = buildEvalArtifactReport(scenarioRun.createdTaskId, test.id, scenarioRun.artifactDir, options);
|
|
1202
|
+
console.log(chalk.blue(`[SKIP] ${test.id} (${applicability.reason})`));
|
|
1203
|
+
const artifactDir = writeHarnessArtifacts({
|
|
1204
|
+
kind: "evals",
|
|
1205
|
+
caseId: test.id,
|
|
1206
|
+
query: testQuery,
|
|
1207
|
+
taskId: scenarioRun.createdTaskId ?? undefined,
|
|
1208
|
+
artifactDir: scenarioRun.artifactDir,
|
|
1209
|
+
runLog: scenarioRun.runLog,
|
|
1210
|
+
finalAnswerText: scenarioRun.finalAnswerText,
|
|
1211
|
+
reportText,
|
|
1212
|
+
result: {
|
|
1213
|
+
id: test.id,
|
|
1214
|
+
applicable: false,
|
|
1215
|
+
passed: false,
|
|
1216
|
+
failures: [applicability.reason ?? "skipped"],
|
|
1217
|
+
likelyFailureKind: inferLikelyFailureKind(test, [applicability.reason ?? "skipped"], undefined),
|
|
1218
|
+
},
|
|
1219
|
+
outputFiles: scenarioRun.outputFiles,
|
|
1220
|
+
});
|
|
1221
|
+
recordEvalOutcome(outcomesById, test.id, {
|
|
1222
|
+
kind: "skipped",
|
|
1223
|
+
applicable: false,
|
|
1224
|
+
failures: [applicability.reason ?? "skipped"],
|
|
1225
|
+
artifactDir,
|
|
1226
|
+
});
|
|
1227
|
+
console.log(chalk.dim(` artifacts: ${artifactDir}`));
|
|
1228
|
+
continue;
|
|
1229
|
+
}
|
|
1230
|
+
const failures = [];
|
|
1231
|
+
for (const token of test.checks.mustContain ?? []) {
|
|
1232
|
+
if (!normalizedRunLog.includes(token)) {
|
|
1233
|
+
failures.push(`missing token: ${token}`);
|
|
1234
|
+
}
|
|
1235
|
+
}
|
|
1236
|
+
for (const token of test.checks.mustNotContain ?? []) {
|
|
1237
|
+
if (normalizedRunLog.includes(token)) {
|
|
1238
|
+
failures.push(`unexpected token: ${token}`);
|
|
1239
|
+
}
|
|
1240
|
+
}
|
|
1241
|
+
for (const [token, max] of Object.entries(test.checks.maxOccurrences ?? {})) {
|
|
1242
|
+
const count = countOccurrences(normalizedRunLog, token);
|
|
1243
|
+
if (count > max) {
|
|
1244
|
+
failures.push(`too many occurrences: "${token}" => ${count} (max ${max})`);
|
|
1245
|
+
}
|
|
1246
|
+
}
|
|
1247
|
+
for (const token of test.checks.mustContainAnswer ?? []) {
|
|
1248
|
+
if (!scenarioRun.finalAnswerText.toLowerCase().includes(token.toLowerCase())) {
|
|
1249
|
+
failures.push(`final answer missing token: ${token}`);
|
|
1250
|
+
}
|
|
1251
|
+
}
|
|
1252
|
+
for (const token of test.checks.mustNotContainAnswer ?? []) {
|
|
1253
|
+
if (scenarioRun.finalAnswerText.toLowerCase().includes(token.toLowerCase())) {
|
|
1254
|
+
failures.push(`final answer contains forbidden token: ${token}`);
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
failures.push(...collectFileCheckFailures(test.checks.fileMustContain, repoRootPath, "must-contain"), ...collectFileCheckFailures(test.checks.fileMustNotContain, repoRootPath, "must-not-contain"), ...collectFileAnyTokenFailures(test.checks.fileMustContainAnyOf, repoRootPath), ...collectSyntaxFailures(test.syntaxCheckPaths, repoRootPath));
|
|
1258
|
+
if (failures.length === 0) {
|
|
1259
|
+
console.log(chalk.green(`[PASS] ${test.id}`));
|
|
1260
|
+
}
|
|
1261
|
+
else {
|
|
1262
|
+
console.log(chalk.red(`[FAIL] ${test.id}`));
|
|
1263
|
+
for (const failure of failures) {
|
|
1264
|
+
console.log(chalk.red(` - ${failure}`));
|
|
1265
|
+
}
|
|
1266
|
+
const tuningHints = deriveTuningHints(failures, scenarioRun.runLog);
|
|
1267
|
+
if (tuningHints.length > 0) {
|
|
1268
|
+
console.log(chalk.yellow(" tuning hints:"));
|
|
1269
|
+
for (const hint of tuningHints) {
|
|
1270
|
+
console.log(chalk.yellow(` - ${hint}`));
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
reportText = buildEvalArtifactReport(scenarioRun.createdTaskId, test.id, scenarioRun.artifactDir, options);
|
|
1275
|
+
const artifactDir = writeHarnessArtifacts({
|
|
1276
|
+
kind: "evals",
|
|
1277
|
+
caseId: test.id,
|
|
1278
|
+
query: testQuery,
|
|
1279
|
+
taskId: scenarioRun.createdTaskId ?? undefined,
|
|
1280
|
+
artifactDir: scenarioRun.artifactDir,
|
|
1281
|
+
runLog: scenarioRun.runLog,
|
|
1282
|
+
finalAnswerText: scenarioRun.finalAnswerText,
|
|
1283
|
+
reportText,
|
|
1284
|
+
result: {
|
|
1285
|
+
id: test.id,
|
|
1286
|
+
applicable: applicability.ok,
|
|
1287
|
+
passed: failures.length === 0,
|
|
1288
|
+
failures,
|
|
1289
|
+
likelyFailureKind: inferLikelyFailureKind(test, failures, scenarioRun.runtimeError),
|
|
1290
|
+
},
|
|
1291
|
+
outputFiles: scenarioRun.outputFiles,
|
|
1292
|
+
});
|
|
1293
|
+
recordEvalOutcome(outcomesById, test.id, {
|
|
1294
|
+
kind: failures.length === 0 ? "passed" : "failed",
|
|
1295
|
+
applicable: true,
|
|
1296
|
+
failures,
|
|
1297
|
+
artifactDir,
|
|
1298
|
+
});
|
|
1299
|
+
console.log(chalk.dim(` artifacts: ${artifactDir}`));
|
|
1300
|
+
}
|
|
1301
|
+
}
|
|
1302
|
+
}
|
|
1303
|
+
catch (err) {
|
|
1304
|
+
batchRunError = err instanceof Error ? err : new Error(String(err));
|
|
1305
|
+
}
|
|
1306
|
+
finally {
|
|
1307
|
+
if (previousSuppress === undefined) {
|
|
1308
|
+
delete process.env[SUPPRESS_FINAL_CONTEXT_ENV];
|
|
1309
|
+
}
|
|
1310
|
+
else {
|
|
1311
|
+
process.env[SUPPRESS_FINAL_CONTEXT_ENV] = previousSuppress;
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
const durationSec = ((Date.now() - startedAt) / 1000).toFixed(1);
|
|
1315
|
+
const summary = buildEvalBatchSummary(outcomesById);
|
|
1316
|
+
assertEvalSummaryInvariant(summary);
|
|
1317
|
+
const batchSummaryArtifactDir = createHarnessArtifactDir("evals", "batch-summary");
|
|
1318
|
+
const batchSummaryArtifact = writeEvalBatchSummaryArtifact(batchSummaryArtifactDir, {
|
|
1319
|
+
selectedEvalIds: suiteFilteredEvals.map((test) => test.id),
|
|
1320
|
+
summary,
|
|
1321
|
+
outcomesById,
|
|
1322
|
+
durationSec,
|
|
1323
|
+
});
|
|
1324
|
+
const color = summary.failed === 0 && summary.incomplete === 0 ? chalk.green : chalk.yellow;
|
|
1325
|
+
console.log(color(formatEvalSummaryLine(summary, durationSec)));
|
|
1326
|
+
console.log(chalk.dim(` batch summary: ${batchSummaryArtifact}`));
|
|
1327
|
+
if (batchRunError) {
|
|
1328
|
+
throw batchRunError;
|
|
1329
|
+
}
|
|
1330
|
+
}
|
|
1331
|
+
/**
 * Returns the directory portion of RUN_LOG_PATH (a constant defined outside
 * this function).
 *
 * Only "/" is treated as a path separator.
 *
 * @returns {string} Everything before the last "/" in RUN_LOG_PATH; "/" when
 *   the only separator is the leading one (e.g. "/run.log"); "." when the
 *   path contains no "/" at all.
 */
function requireRunLogDir() {
    const lastSlash = RUN_LOG_PATH.lastIndexOf("/");
    if (lastSlash < 0) {
        // Bare file name with no directory component.
        return ".";
    }
    if (lastSlash === 0) {
        // The file sits directly under the root; slicing up to index 0 would
        // give "", and "." would wrongly point at the current directory.
        return "/";
    }
    return RUN_LOG_PATH.slice(0, lastSlash);
}
|
|
1337
|
+
//# sourceMappingURL=evalCommands.js.map
|