agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
|
5
|
+
<title>Coverage for src\cmvk\benchmarks.py: 0%</title>
|
|
6
|
+
<link rel="icon" sizes="32x32" href="favicon_32_cb_c827f16f.png">
|
|
7
|
+
<link rel="stylesheet" href="style_cb_5c747636.css" type="text/css">
|
|
8
|
+
<script src="coverage_html_cb_188fc9a4.js" defer></script>
|
|
9
|
+
</head>
|
|
10
|
+
<body class="pyfile">
|
|
11
|
+
<header>
|
|
12
|
+
<div class="content">
|
|
13
|
+
<h1>
|
|
14
|
+
<span class="text">Coverage for </span><b>src \ cmvk \ benchmarks.py</b>:
|
|
15
|
+
<span class="pc_cov">0%</span>
|
|
16
|
+
</h1>
|
|
17
|
+
<aside id="help_panel_wrapper">
|
|
18
|
+
<input id="help_panel_state" type="checkbox">
|
|
19
|
+
<label for="help_panel_state">
|
|
20
|
+
<img id="keyboard_icon" src="keybd_closed_cb_900cfef5.png" alt="Show/hide keyboard shortcuts">
|
|
21
|
+
</label>
|
|
22
|
+
<div id="help_panel">
|
|
23
|
+
<p class="legend">Shortcuts on this page</p>
|
|
24
|
+
<div class="keyhelp">
|
|
25
|
+
<p>
|
|
26
|
+
<kbd>r</kbd>
|
|
27
|
+
<kbd>m</kbd>
|
|
28
|
+
<kbd>x</kbd>
|
|
29
|
+
<kbd>p</kbd>
|
|
30
|
+
toggle line displays
|
|
31
|
+
</p>
|
|
32
|
+
<p>
|
|
33
|
+
<kbd>j</kbd>
|
|
34
|
+
<kbd>k</kbd>
|
|
35
|
+
next/prev highlighted chunk
|
|
36
|
+
</p>
|
|
37
|
+
<p>
|
|
38
|
+
<kbd>0</kbd> (zero) top of page
|
|
39
|
+
</p>
|
|
40
|
+
<p>
|
|
41
|
+
<kbd>1</kbd> (one) first highlighted chunk
|
|
42
|
+
</p>
|
|
43
|
+
<p>
|
|
44
|
+
<kbd>[</kbd>
|
|
45
|
+
<kbd>]</kbd>
|
|
46
|
+
prev/next file
|
|
47
|
+
</p>
|
|
48
|
+
<p>
|
|
49
|
+
<kbd>u</kbd> up to the index
|
|
50
|
+
</p>
|
|
51
|
+
<p>
|
|
52
|
+
<kbd>?</kbd> show/hide this help
|
|
53
|
+
</p>
|
|
54
|
+
</div>
|
|
55
|
+
</div>
|
|
56
|
+
</aside>
|
|
57
|
+
<h2>
|
|
58
|
+
<span class="text">178 statements </span>
|
|
59
|
+
<button type="button" class="run button_toggle_run" value="run" data-shortcut="r" title="Toggle lines run">0<span class="text"> run</span></button>
|
|
60
|
+
<button type="button" class="mis show_mis button_toggle_mis" value="mis" data-shortcut="m" title="Toggle lines missing">178<span class="text"> missing</span></button>
|
|
61
|
+
<button type="button" class="exc show_exc button_toggle_exc" value="exc" data-shortcut="x" title="Toggle lines excluded">12<span class="text"> excluded</span></button>
|
|
62
|
+
<button type="button" class="par run show_par button_toggle_par" value="par" data-shortcut="p" title="Toggle lines partially run">0<span class="text"> partial</span></button>
|
|
63
|
+
</h2>
|
|
64
|
+
<p class="text">
|
|
65
|
+
<a id="prevFileLink" class="nav" href="z_2c49bd2ed3e01e38_audit_py.html">« prev</a>
|
|
66
|
+
<a id="indexLink" class="nav" href="index.html">^ index</a>
|
|
67
|
+
<a id="nextFileLink" class="nav" href="z_2c49bd2ed3e01e38_constitutional_py.html">» next</a>
|
|
68
|
+
|
|
69
|
+
<a class="nav" href="https://coverage.readthedocs.io/en/7.13.1">coverage.py v7.13.1</a>,
|
|
70
|
+
created at 2026-02-02 21:04 -0800
|
|
71
|
+
</p>
|
|
72
|
+
<aside class="hidden">
|
|
73
|
+
<button type="button" class="button_next_chunk" data-shortcut="j"></button>
|
|
74
|
+
<button type="button" class="button_prev_chunk" data-shortcut="k"></button>
|
|
75
|
+
<button type="button" class="button_top_of_page" data-shortcut="0"></button>
|
|
76
|
+
<button type="button" class="button_first_chunk" data-shortcut="1"></button>
|
|
77
|
+
<button type="button" class="button_prev_file" data-shortcut="["></button>
|
|
78
|
+
<button type="button" class="button_next_file" data-shortcut="]"></button>
|
|
79
|
+
<button type="button" class="button_to_index" data-shortcut="u"></button>
|
|
80
|
+
<button type="button" class="button_show_hide_help" data-shortcut="?"></button>
|
|
81
|
+
</aside>
|
|
82
|
+
</div>
|
|
83
|
+
</header>
|
|
84
|
+
<main id="source">
|
|
85
|
+
<p class="pln"><span class="n"><a id="t1" href="#t1">1</a></span><span class="t"><span class="str">"""</span> </span><span class="r"></span></p>
|
|
86
|
+
<p class="pln"><span class="n"><a id="t2" href="#t2">2</a></span><span class="t"><span class="str">CMVK Benchmark Suite</span> </span><span class="r"></span></p>
|
|
87
|
+
<p class="pln"><span class="n"><a id="t3" href="#t3">3</a></span><span class="t"> </span><span class="r"></span></p>
|
|
88
|
+
<p class="pln"><span class="n"><a id="t4" href="#t4">4</a></span><span class="t"><span class="str">Framework for benchmarking single-model vs multi-model verification accuracy.</span> </span><span class="r"></span></p>
|
|
89
|
+
<p class="pln"><span class="n"><a id="t5" href="#t5">5</a></span><span class="t"><span class="str">This creates the infrastructure for benchmarks - actual results require running</span> </span><span class="r"></span></p>
|
|
90
|
+
<p class="pln"><span class="n"><a id="t6" href="#t6">6</a></span><span class="t"><span class="str">with real LLM API calls.</span> </span><span class="r"></span></p>
|
|
91
|
+
<p class="pln"><span class="n"><a id="t7" href="#t7">7</a></span><span class="t"> </span><span class="r"></span></p>
|
|
92
|
+
<p class="pln"><span class="n"><a id="t8" href="#t8">8</a></span><span class="t"><span class="str">Usage:</span> </span><span class="r"></span></p>
|
|
93
|
+
<p class="pln"><span class="n"><a id="t9" href="#t9">9</a></span><span class="t"><span class="str"> python -m cmvk.benchmarks.run --models gpt-4,claude-sonnet-4,gemini-pro</span> </span><span class="r"></span></p>
|
|
94
|
+
<p class="pln"><span class="n"><a id="t10" href="#t10">10</a></span><span class="t"><span class="str">"""</span> </span><span class="r"></span></p>
|
|
95
|
+
<p class="pln"><span class="n"><a id="t11" href="#t11">11</a></span><span class="t"> </span><span class="r"></span></p>
|
|
96
|
+
<p class="mis show_mis"><span class="n"><a id="t12" href="#t12">12</a></span><span class="t"><span class="key">from</span> <span class="nam">__future__</span> <span class="key">import</span> <span class="nam">annotations</span> </span><span class="r"></span></p>
|
|
97
|
+
<p class="pln"><span class="n"><a id="t13" href="#t13">13</a></span><span class="t"> </span><span class="r"></span></p>
|
|
98
|
+
<p class="mis show_mis"><span class="n"><a id="t14" href="#t14">14</a></span><span class="t"><span class="key">import</span> <span class="nam">json</span> </span><span class="r"></span></p>
|
|
99
|
+
<p class="mis show_mis"><span class="n"><a id="t15" href="#t15">15</a></span><span class="t"><span class="key">import</span> <span class="nam">time</span> </span><span class="r"></span></p>
|
|
100
|
+
<p class="mis show_mis"><span class="n"><a id="t16" href="#t16">16</a></span><span class="t"><span class="key">from</span> <span class="nam">dataclasses</span> <span class="key">import</span> <span class="nam">dataclass</span><span class="op">,</span> <span class="nam">field</span> </span><span class="r"></span></p>
|
|
101
|
+
<p class="mis show_mis"><span class="n"><a id="t17" href="#t17">17</a></span><span class="t"><span class="key">from</span> <span class="nam">datetime</span> <span class="key">import</span> <span class="nam">datetime</span> </span><span class="r"></span></p>
|
|
102
|
+
<p class="mis show_mis"><span class="n"><a id="t18" href="#t18">18</a></span><span class="t"><span class="key">from</span> <span class="nam">enum</span> <span class="key">import</span> <span class="nam">Enum</span> </span><span class="r"></span></p>
|
|
103
|
+
<p class="mis show_mis"><span class="n"><a id="t19" href="#t19">19</a></span><span class="t"><span class="key">from</span> <span class="nam">pathlib</span> <span class="key">import</span> <span class="nam">Path</span> </span><span class="r"></span></p>
|
|
104
|
+
<p class="mis show_mis"><span class="n"><a id="t20" href="#t20">20</a></span><span class="t"><span class="key">from</span> <span class="nam">typing</span> <span class="key">import</span> <span class="nam">Any</span><span class="op">,</span> <span class="nam">Callable</span><span class="op">,</span> <span class="nam">Optional</span> </span><span class="r"></span></p>
|
|
105
|
+
<p class="pln"><span class="n"><a id="t21" href="#t21">21</a></span><span class="t"> </span><span class="r"></span></p>
|
|
106
|
+
<p class="mis show_mis"><span class="n"><a id="t22" href="#t22">22</a></span><span class="t"><span class="key">import</span> <span class="nam">numpy</span> <span class="key">as</span> <span class="nam">np</span> </span><span class="r"></span></p>
|
|
107
|
+
<p class="pln"><span class="n"><a id="t23" href="#t23">23</a></span><span class="t"> </span><span class="r"></span></p>
|
|
108
|
+
<p class="pln"><span class="n"><a id="t24" href="#t24">24</a></span><span class="t"> </span><span class="r"></span></p>
|
|
109
|
+
<p class="mis show_mis"><span class="n"><a id="t25" href="#t25">25</a></span><span class="t"><span class="key">class</span> <span class="nam">TaskCategory</span><span class="op">(</span><span class="nam">Enum</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
110
|
+
<p class="pln"><span class="n"><a id="t26" href="#t26">26</a></span><span class="t"> <span class="str">"""Categories of benchmark tasks."""</span> </span><span class="r"></span></p>
|
|
111
|
+
<p class="mis show_mis"><span class="n"><a id="t27" href="#t27">27</a></span><span class="t"> <span class="nam">FACTUAL</span> <span class="op">=</span> <span class="str">"factual"</span> <span class="com"># Verifiable facts</span> </span><span class="r"></span></p>
|
|
112
|
+
<p class="mis show_mis"><span class="n"><a id="t28" href="#t28">28</a></span><span class="t"> <span class="nam">MATHEMATICAL</span> <span class="op">=</span> <span class="str">"mathematical"</span> <span class="com"># Numeric calculations</span> </span><span class="r"></span></p>
|
|
113
|
+
<p class="mis show_mis"><span class="n"><a id="t29" href="#t29">29</a></span><span class="t"> <span class="nam">REASONING</span> <span class="op">=</span> <span class="str">"reasoning"</span> <span class="com"># Logic and inference</span> </span><span class="r"></span></p>
|
|
114
|
+
<p class="mis show_mis"><span class="n"><a id="t30" href="#t30">30</a></span><span class="t"> <span class="nam">EXTRACTION</span> <span class="op">=</span> <span class="str">"extraction"</span> <span class="com"># Information extraction</span> </span><span class="r"></span></p>
|
|
115
|
+
<p class="pln"><span class="n"><a id="t31" href="#t31">31</a></span><span class="t"> </span><span class="r"></span></p>
|
|
116
|
+
<p class="pln"><span class="n"><a id="t32" href="#t32">32</a></span><span class="t"> </span><span class="r"></span></p>
|
|
117
|
+
<p class="mis show_mis"><span class="n"><a id="t33" href="#t33">33</a></span><span class="t"><span class="op">@</span><span class="nam">dataclass</span> </span><span class="r"></span></p>
|
|
118
|
+
<p class="mis show_mis"><span class="n"><a id="t34" href="#t34">34</a></span><span class="t"><span class="key">class</span> <span class="nam">BenchmarkTask</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
119
|
+
<p class="pln"><span class="n"><a id="t35" href="#t35">35</a></span><span class="t"> <span class="str">"""A single benchmark task."""</span> </span><span class="r"></span></p>
|
|
120
|
+
<p class="mis show_mis"><span class="n"><a id="t36" href="#t36">36</a></span><span class="t"> <span class="nam">id</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
121
|
+
<p class="mis show_mis"><span class="n"><a id="t37" href="#t37">37</a></span><span class="t"> <span class="nam">category</span><span class="op">:</span> <span class="nam">TaskCategory</span> </span><span class="r"></span></p>
|
|
122
|
+
<p class="mis show_mis"><span class="n"><a id="t38" href="#t38">38</a></span><span class="t"> <span class="nam">prompt</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
123
|
+
<p class="mis show_mis"><span class="n"><a id="t39" href="#t39">39</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span> </span><span class="r"></span></p>
|
|
124
|
+
<p class="mis show_mis"><span class="n"><a id="t40" href="#t40">40</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">:</span> <span class="nam">str</span> <span class="op">=</span> <span class="str">"medium"</span> <span class="com"># easy, medium, hard</span> </span><span class="r"></span></p>
|
|
125
|
+
<p class="mis show_mis"><span class="n"><a id="t41" href="#t41">41</a></span><span class="t"> <span class="nam">metadata</span><span class="op">:</span> <span class="nam">dict</span> <span class="op">=</span> <span class="nam">field</span><span class="op">(</span><span class="nam">default_factory</span><span class="op">=</span><span class="nam">dict</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
126
|
+
<p class="pln"><span class="n"><a id="t42" href="#t42">42</a></span><span class="t"> </span><span class="r"></span></p>
|
|
127
|
+
<p class="pln"><span class="n"><a id="t43" href="#t43">43</a></span><span class="t"> </span><span class="r"></span></p>
|
|
128
|
+
<p class="mis show_mis"><span class="n"><a id="t44" href="#t44">44</a></span><span class="t"><span class="op">@</span><span class="nam">dataclass</span> </span><span class="r"></span></p>
|
|
129
|
+
<p class="mis show_mis"><span class="n"><a id="t45" href="#t45">45</a></span><span class="t"><span class="key">class</span> <span class="nam">ModelResponse</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
130
|
+
<p class="pln"><span class="n"><a id="t46" href="#t46">46</a></span><span class="t"> <span class="str">"""Response from a single model."""</span> </span><span class="r"></span></p>
|
|
131
|
+
<p class="mis show_mis"><span class="n"><a id="t47" href="#t47">47</a></span><span class="t"> <span class="nam">model_name</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
132
|
+
<p class="mis show_mis"><span class="n"><a id="t48" href="#t48">48</a></span><span class="t"> <span class="nam">response</span><span class="op">:</span> <span class="nam">Any</span> </span><span class="r"></span></p>
|
|
133
|
+
<p class="mis show_mis"><span class="n"><a id="t49" href="#t49">49</a></span><span class="t"> <span class="nam">latency_ms</span><span class="op">:</span> <span class="nam">float</span> </span><span class="r"></span></p>
|
|
134
|
+
<p class="mis show_mis"><span class="n"><a id="t50" href="#t50">50</a></span><span class="t"> <span class="nam">raw_output</span><span class="op">:</span> <span class="nam">str</span> <span class="op">=</span> <span class="str">""</span> </span><span class="r"></span></p>
|
|
135
|
+
<p class="pln"><span class="n"><a id="t51" href="#t51">51</a></span><span class="t"> </span><span class="r"></span></p>
|
|
136
|
+
<p class="pln"><span class="n"><a id="t52" href="#t52">52</a></span><span class="t"> </span><span class="r"></span></p>
|
|
137
|
+
<p class="mis show_mis"><span class="n"><a id="t53" href="#t53">53</a></span><span class="t"><span class="op">@</span><span class="nam">dataclass</span> </span><span class="r"></span></p>
|
|
138
|
+
<p class="mis show_mis"><span class="n"><a id="t54" href="#t54">54</a></span><span class="t"><span class="key">class</span> <span class="nam">VerificationResult</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
139
|
+
<p class="pln"><span class="n"><a id="t55" href="#t55">55</a></span><span class="t"> <span class="str">"""Result of verifying a task."""</span> </span><span class="r"></span></p>
|
|
140
|
+
<p class="mis show_mis"><span class="n"><a id="t56" href="#t56">56</a></span><span class="t"> <span class="nam">task_id</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
141
|
+
<p class="mis show_mis"><span class="n"><a id="t57" href="#t57">57</a></span><span class="t"> <span class="nam">is_correct</span><span class="op">:</span> <span class="nam">bool</span> </span><span class="r"></span></p>
|
|
142
|
+
<p class="mis show_mis"><span class="n"><a id="t58" href="#t58">58</a></span><span class="t"> <span class="nam">confidence</span><span class="op">:</span> <span class="nam">float</span> </span><span class="r"></span></p>
|
|
143
|
+
<p class="mis show_mis"><span class="n"><a id="t59" href="#t59">59</a></span><span class="t"> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
144
|
+
<p class="mis show_mis"><span class="n"><a id="t60" href="#t60">60</a></span><span class="t"> <span class="nam">consensus_method</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
145
|
+
<p class="mis show_mis"><span class="n"><a id="t61" href="#t61">61</a></span><span class="t"> <span class="nam">drift_score</span><span class="op">:</span> <span class="nam">float</span> <span class="op">=</span> <span class="num">0.0</span> </span><span class="r"></span></p>
|
|
146
|
+
<p class="mis show_mis"><span class="n"><a id="t62" href="#t62">62</a></span><span class="t"> <span class="nam">details</span><span class="op">:</span> <span class="nam">dict</span> <span class="op">=</span> <span class="nam">field</span><span class="op">(</span><span class="nam">default_factory</span><span class="op">=</span><span class="nam">dict</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
147
|
+
<p class="pln"><span class="n"><a id="t63" href="#t63">63</a></span><span class="t"> </span><span class="r"></span></p>
|
|
148
|
+
<p class="pln"><span class="n"><a id="t64" href="#t64">64</a></span><span class="t"> </span><span class="r"></span></p>
|
|
149
|
+
<p class="mis show_mis"><span class="n"><a id="t65" href="#t65">65</a></span><span class="t"><span class="op">@</span><span class="nam">dataclass</span> </span><span class="r"></span></p>
|
|
150
|
+
<p class="mis show_mis"><span class="n"><a id="t66" href="#t66">66</a></span><span class="t"><span class="key">class</span> <span class="nam">BenchmarkResults</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
151
|
+
<p class="pln"><span class="n"><a id="t67" href="#t67">67</a></span><span class="t"> <span class="str">"""Aggregate benchmark results."""</span> </span><span class="r"></span></p>
|
|
152
|
+
<p class="mis show_mis"><span class="n"><a id="t68" href="#t68">68</a></span><span class="t"> <span class="nam">total_tasks</span><span class="op">:</span> <span class="nam">int</span> </span><span class="r"></span></p>
|
|
153
|
+
<p class="mis show_mis"><span class="n"><a id="t69" href="#t69">69</a></span><span class="t"> <span class="nam">correct</span><span class="op">:</span> <span class="nam">int</span> </span><span class="r"></span></p>
|
|
154
|
+
<p class="mis show_mis"><span class="n"><a id="t70" href="#t70">70</a></span><span class="t"> <span class="nam">accuracy</span><span class="op">:</span> <span class="nam">float</span> </span><span class="r"></span></p>
|
|
155
|
+
<p class="mis show_mis"><span class="n"><a id="t71" href="#t71">71</a></span><span class="t"> <span class="nam">avg_latency_ms</span><span class="op">:</span> <span class="nam">float</span> </span><span class="r"></span></p>
|
|
156
|
+
<p class="mis show_mis"><span class="n"><a id="t72" href="#t72">72</a></span><span class="t"> <span class="nam">by_category</span><span class="op">:</span> <span class="nam">dict</span><span class="op">[</span><span class="nam">str</span><span class="op">,</span> <span class="nam">dict</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
157
|
+
<p class="mis show_mis"><span class="n"><a id="t73" href="#t73">73</a></span><span class="t"> <span class="nam">by_difficulty</span><span class="op">:</span> <span class="nam">dict</span><span class="op">[</span><span class="nam">str</span><span class="op">,</span> <span class="nam">dict</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
158
|
+
<p class="mis show_mis"><span class="n"><a id="t74" href="#t74">74</a></span><span class="t"> <span class="nam">timestamp</span><span class="op">:</span> <span class="nam">str</span> </span><span class="r"></span></p>
|
|
159
|
+
<p class="mis show_mis"><span class="n"><a id="t75" href="#t75">75</a></span><span class="t"> <span class="nam">config</span><span class="op">:</span> <span class="nam">dict</span> </span><span class="r"></span></p>
|
|
160
|
+
<p class="pln"><span class="n"><a id="t76" href="#t76">76</a></span><span class="t"> </span><span class="r"></span></p>
|
|
161
|
+
<p class="pln"><span class="n"><a id="t77" href="#t77">77</a></span><span class="t"> </span><span class="r"></span></p>
|
|
162
|
+
<p class="mis show_mis"><span class="n"><a id="t78" href="#t78">78</a></span><span class="t"><span class="key">class</span> <span class="nam">ConsensusMethod</span><span class="op">(</span><span class="nam">Enum</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
163
|
+
<p class="pln"><span class="n"><a id="t79" href="#t79">79</a></span><span class="t"> <span class="str">"""Methods for reaching consensus across models."""</span> </span><span class="r"></span></p>
|
|
164
|
+
<p class="mis show_mis"><span class="n"><a id="t80" href="#t80">80</a></span><span class="t"> <span class="nam">MAJORITY_VOTE</span> <span class="op">=</span> <span class="str">"majority_vote"</span> </span><span class="r"></span></p>
|
|
165
|
+
<p class="mis show_mis"><span class="n"><a id="t81" href="#t81">81</a></span><span class="t"> <span class="nam">UNANIMOUS</span> <span class="op">=</span> <span class="str">"unanimous"</span> </span><span class="r"></span></p>
|
|
166
|
+
<p class="mis show_mis"><span class="n"><a id="t82" href="#t82">82</a></span><span class="t"> <span class="nam">WEIGHTED</span> <span class="op">=</span> <span class="str">"weighted"</span> </span><span class="r"></span></p>
|
|
167
|
+
<p class="mis show_mis"><span class="n"><a id="t83" href="#t83">83</a></span><span class="t"> <span class="nam">DRIFT_THRESHOLD</span> <span class="op">=</span> <span class="str">"drift_threshold"</span> </span><span class="r"></span></p>
|
|
168
|
+
<p class="pln"><span class="n"><a id="t84" href="#t84">84</a></span><span class="t"> </span><span class="r"></span></p>
|
|
169
|
+
<p class="pln"><span class="n"><a id="t85" href="#t85">85</a></span><span class="t"> </span><span class="r"></span></p>
|
|
170
|
+
<p class="mis show_mis"><span class="n"><a id="t86" href="#t86">86</a></span><span class="t"><span class="key">class</span> <span class="nam">CMVKBenchmark</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
171
|
+
<p class="pln"><span class="n"><a id="t87" href="#t87">87</a></span><span class="t"> <span class="str">"""</span> </span><span class="r"></span></p>
|
|
172
|
+
<p class="pln"><span class="n"><a id="t88" href="#t88">88</a></span><span class="t"><span class="str"> Benchmark framework for CMVK verification.</span> </span><span class="r"></span></p>
|
|
173
|
+
<p class="pln"><span class="n"><a id="t89" href="#t89">89</a></span><span class="t"><span class="str"> </span> </span><span class="r"></span></p>
|
|
174
|
+
<p class="pln"><span class="n"><a id="t90" href="#t90">90</a></span><span class="t"><span class="str"> This class provides infrastructure for running benchmarks.</span> </span><span class="r"></span></p>
|
|
175
|
+
<p class="pln"><span class="n"><a id="t91" href="#t91">91</a></span><span class="t"><span class="str"> Actual model calls must be provided via the `model_fn` parameter.</span> </span><span class="r"></span></p>
|
|
176
|
+
<p class="pln"><span class="n"><a id="t92" href="#t92">92</a></span><span class="t"><span class="str"> """</span> </span><span class="r"></span></p>
|
|
177
|
+
<p class="pln"><span class="n"><a id="t93" href="#t93">93</a></span><span class="t"> </span><span class="r"></span></p>
|
|
178
|
+
<p class="mis show_mis"><span class="n"><a id="t94" href="#t94">94</a></span><span class="t"> <span class="key">def</span> <span class="nam">__init__</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
179
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t95" href="#t95">95</a></span><span class="t"> <span class="nam">self</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
180
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t96" href="#t96">96</a></span><span class="t"> <span class="nam">models</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">str</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
181
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t97" href="#t97">97</a></span><span class="t"> <span class="nam">model_fn</span><span class="op">:</span> <span class="nam">Optional</span><span class="op">[</span><span class="nam">Callable</span><span class="op">[</span><span class="op">[</span><span class="nam">str</span><span class="op">,</span> <span class="nam">str</span><span class="op">]</span><span class="op">,</span> <span class="nam">str</span><span class="op">]</span><span class="op">]</span> <span class="op">=</span> <span class="key">None</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
182
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t98" href="#t98">98</a></span><span class="t"> <span class="nam">consensus_method</span><span class="op">:</span> <span class="nam">ConsensusMethod</span> <span class="op">=</span> <span class="nam">ConsensusMethod</span><span class="op">.</span><span class="nam">DRIFT_THRESHOLD</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
183
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t99" href="#t99">99</a></span><span class="t"> <span class="nam">drift_threshold</span><span class="op">:</span> <span class="nam">float</span> <span class="op">=</span> <span class="num">0.15</span> </span><span class="r"></span></p>
|
|
184
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t100" href="#t100">100</a></span><span class="t"> <span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
185
|
+
<p class="pln"><span class="n"><a id="t101" href="#t101">101</a></span><span class="t"> <span class="str">"""</span> </span><span class="r"></span></p>
|
|
186
|
+
<p class="pln"><span class="n"><a id="t102" href="#t102">102</a></span><span class="t"><span class="str"> Initialize benchmark.</span> </span><span class="r"></span></p>
|
|
187
|
+
<p class="pln"><span class="n"><a id="t103" href="#t103">103</a></span><span class="t"><span class="str"> </span> </span><span class="r"></span></p>
|
|
188
|
+
<p class="pln"><span class="n"><a id="t104" href="#t104">104</a></span><span class="t"><span class="str"> Args:</span> </span><span class="r"></span></p>
|
|
189
|
+
<p class="pln"><span class="n"><a id="t105" href="#t105">105</a></span><span class="t"><span class="str"> models: List of model names to use</span> </span><span class="r"></span></p>
|
|
190
|
+
<p class="pln"><span class="n"><a id="t106" href="#t106">106</a></span><span class="t"><span class="str"> model_fn: Function(model_name, prompt) -> response</span> </span><span class="r"></span></p>
|
|
191
|
+
<p class="pln"><span class="n"><a id="t107" href="#t107">107</a></span><span class="t"><span class="str"> If None, uses mock responses for testing</span> </span><span class="r"></span></p>
|
|
192
|
+
<p class="pln"><span class="n"><a id="t108" href="#t108">108</a></span><span class="t"><span class="str"> consensus_method: How to combine model responses</span> </span><span class="r"></span></p>
|
|
193
|
+
<p class="pln"><span class="n"><a id="t109" href="#t109">109</a></span><span class="t"><span class="str"> drift_threshold: Threshold for drift-based consensus</span> </span><span class="r"></span></p>
|
|
194
|
+
<p class="pln"><span class="n"><a id="t110" href="#t110">110</a></span><span class="t"><span class="str"> """</span> </span><span class="r"></span></p>
|
|
195
|
+
<p class="mis show_mis"><span class="n"><a id="t111" href="#t111">111</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">models</span> <span class="op">=</span> <span class="nam">models</span> </span><span class="r"></span></p>
|
|
196
|
+
<p class="mis show_mis"><span class="n"><a id="t112" href="#t112">112</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">model_fn</span> <span class="op">=</span> <span class="nam">model_fn</span> <span class="key">or</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_mock_model_fn</span> </span><span class="r"></span></p>
|
|
197
|
+
<p class="mis show_mis"><span class="n"><a id="t113" href="#t113">113</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">consensus_method</span> <span class="op">=</span> <span class="nam">consensus_method</span> </span><span class="r"></span></p>
|
|
198
|
+
<p class="mis show_mis"><span class="n"><a id="t114" href="#t114">114</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">drift_threshold</span> <span class="op">=</span> <span class="nam">drift_threshold</span> </span><span class="r"></span></p>
|
|
199
|
+
<p class="mis show_mis"><span class="n"><a id="t115" href="#t115">115</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">results</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">VerificationResult</span><span class="op">]</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
200
|
+
<p class="pln"><span class="n"><a id="t116" href="#t116">116</a></span><span class="t"> </span><span class="r"></span></p>
|
|
201
|
+
<p class="mis show_mis"><span class="n"><a id="t117" href="#t117">117</a></span><span class="t"> <span class="key">def</span> <span class="nam">_mock_model_fn</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">model_name</span><span class="op">:</span> <span class="nam">str</span><span class="op">,</span> <span class="nam">prompt</span><span class="op">:</span> <span class="nam">str</span><span class="op">)</span> <span class="op">-></span> <span class="nam">str</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
202
|
+
<p class="pln"><span class="n"><a id="t118" href="#t118">118</a></span><span class="t"> <span class="str">"""Mock model function for testing the framework."""</span> </span><span class="r"></span></p>
|
|
203
|
+
<p class="pln"><span class="n"><a id="t119" href="#t119">119</a></span><span class="t"> <span class="com"># Return deterministic mock response based on prompt hash</span> </span><span class="r"></span></p>
|
|
204
|
+
<p class="mis show_mis"><span class="n"><a id="t120" href="#t120">120</a></span><span class="t"> <span class="key">return</span> <span class="fst">f"</span><span class="fst">Mock response from </span><span class="op">{</span><span class="nam">model_name</span><span class="op">}</span><span class="fst"> for prompt hash </span><span class="op">{</span><span class="nam">hash</span><span class="op">(</span><span class="nam">prompt</span><span class="op">)</span> <span class="op">%</span> <span class="num">1000</span><span class="op">}</span><span class="fst">"</span> </span><span class="r"></span></p>
|
|
205
|
+
<p class="pln"><span class="n"><a id="t121" href="#t121">121</a></span><span class="t"> </span><span class="r"></span></p>
|
|
206
|
+
<p class="mis show_mis"><span class="n"><a id="t122" href="#t122">122</a></span><span class="t"> <span class="key">def</span> <span class="nam">load_tasks</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">path</span><span class="op">:</span> <span class="nam">Path</span><span class="op">)</span> <span class="op">-></span> <span class="nam">list</span><span class="op">[</span><span class="nam">BenchmarkTask</span><span class="op">]</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
207
|
+
<p class="pln"><span class="n"><a id="t123" href="#t123">123</a></span><span class="t"> <span class="str">"""Load benchmark tasks from JSON file."""</span> </span><span class="r"></span></p>
|
|
208
|
+
<p class="mis show_mis"><span class="n"><a id="t124" href="#t124">124</a></span><span class="t"> <span class="key">with</span> <span class="nam">open</span><span class="op">(</span><span class="nam">path</span><span class="op">)</span> <span class="key">as</span> <span class="nam">f</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
209
|
+
<p class="mis show_mis"><span class="n"><a id="t125" href="#t125">125</a></span><span class="t"> <span class="nam">data</span> <span class="op">=</span> <span class="nam">json</span><span class="op">.</span><span class="nam">load</span><span class="op">(</span><span class="nam">f</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
210
|
+
<p class="pln"><span class="n"><a id="t126" href="#t126">126</a></span><span class="t"> </span><span class="r"></span></p>
|
|
211
|
+
<p class="mis show_mis"><span class="n"><a id="t127" href="#t127">127</a></span><span class="t"> <span class="key">return</span> <span class="op">[</span> </span><span class="r"></span></p>
|
|
212
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t128" href="#t128">128</a></span><span class="t"> <span class="nam">BenchmarkTask</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
213
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t129" href="#t129">129</a></span><span class="t"> <span class="nam">id</span><span class="op">=</span><span class="nam">t</span><span class="op">[</span><span class="str">"id"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
214
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t130" href="#t130">130</a></span><span class="t"> <span class="nam">category</span><span class="op">=</span><span class="nam">TaskCategory</span><span class="op">(</span><span class="nam">t</span><span class="op">[</span><span class="str">"category"</span><span class="op">]</span><span class="op">)</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
215
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t131" href="#t131">131</a></span><span class="t"> <span class="nam">prompt</span><span class="op">=</span><span class="nam">t</span><span class="op">[</span><span class="str">"prompt"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
216
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t132" href="#t132">132</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">=</span><span class="nam">t</span><span class="op">[</span><span class="str">"ground_truth"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
217
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t133" href="#t133">133</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">=</span><span class="nam">t</span><span class="op">.</span><span class="nam">get</span><span class="op">(</span><span class="str">"difficulty"</span><span class="op">,</span> <span class="str">"medium"</span><span class="op">)</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
218
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t134" href="#t134">134</a></span><span class="t"> <span class="nam">metadata</span><span class="op">=</span><span class="nam">t</span><span class="op">.</span><span class="nam">get</span><span class="op">(</span><span class="str">"metadata"</span><span class="op">,</span> <span class="op">{</span><span class="op">}</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
219
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t135" href="#t135">135</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
220
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t136" href="#t136">136</a></span><span class="t"> <span class="key">for</span> <span class="nam">t</span> <span class="key">in</span> <span class="nam">data</span><span class="op">[</span><span class="str">"tasks"</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
221
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t137" href="#t137">137</a></span><span class="t"> <span class="op">]</span> </span><span class="r"></span></p>
|
|
222
|
+
<p class="pln"><span class="n"><a id="t138" href="#t138">138</a></span><span class="t"> </span><span class="r"></span></p>
|
|
223
|
+
<p class="mis show_mis"><span class="n"><a id="t139" href="#t139">139</a></span><span class="t"> <span class="key">def</span> <span class="nam">run_single_model</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">tasks</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">BenchmarkTask</span><span class="op">]</span><span class="op">,</span> <span class="nam">model</span><span class="op">:</span> <span class="nam">str</span><span class="op">)</span> <span class="op">-></span> <span class="nam">list</span><span class="op">[</span><span class="nam">VerificationResult</span><span class="op">]</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
224
|
+
<p class="pln"><span class="n"><a id="t140" href="#t140">140</a></span><span class="t"> <span class="str">"""Run benchmark with a single model."""</span> </span><span class="r"></span></p>
|
|
225
|
+
<p class="mis show_mis"><span class="n"><a id="t141" href="#t141">141</a></span><span class="t"> <span class="nam">results</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
226
|
+
<p class="pln"><span class="n"><a id="t142" href="#t142">142</a></span><span class="t"> </span><span class="r"></span></p>
|
|
227
|
+
<p class="mis show_mis"><span class="n"><a id="t143" href="#t143">143</a></span><span class="t"> <span class="key">for</span> <span class="nam">task</span> <span class="key">in</span> <span class="nam">tasks</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
228
|
+
<p class="mis show_mis"><span class="n"><a id="t144" href="#t144">144</a></span><span class="t"> <span class="nam">start</span> <span class="op">=</span> <span class="nam">time</span><span class="op">.</span><span class="nam">perf_counter</span><span class="op">(</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
229
|
+
<p class="mis show_mis"><span class="n"><a id="t145" href="#t145">145</a></span><span class="t"> <span class="nam">response</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">model_fn</span><span class="op">(</span><span class="nam">model</span><span class="op">,</span> <span class="nam">task</span><span class="op">.</span><span class="nam">prompt</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
230
|
+
<p class="mis show_mis"><span class="n"><a id="t146" href="#t146">146</a></span><span class="t"> <span class="nam">latency</span> <span class="op">=</span> <span class="op">(</span><span class="nam">time</span><span class="op">.</span><span class="nam">perf_counter</span><span class="op">(</span><span class="op">)</span> <span class="op">-</span> <span class="nam">start</span><span class="op">)</span> <span class="op">*</span> <span class="num">1000</span> </span><span class="r"></span></p>
|
|
231
|
+
<p class="pln"><span class="n"><a id="t147" href="#t147">147</a></span><span class="t"> </span><span class="r"></span></p>
|
|
232
|
+
<p class="mis show_mis"><span class="n"><a id="t148" href="#t148">148</a></span><span class="t"> <span class="nam">model_response</span> <span class="op">=</span> <span class="nam">ModelResponse</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
233
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t149" href="#t149">149</a></span><span class="t"> <span class="nam">model_name</span><span class="op">=</span><span class="nam">model</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
234
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t150" href="#t150">150</a></span><span class="t"> <span class="nam">response</span><span class="op">=</span><span class="nam">response</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
235
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t151" href="#t151">151</a></span><span class="t"> <span class="nam">latency_ms</span><span class="op">=</span><span class="nam">latency</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
236
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t152" href="#t152">152</a></span><span class="t"> <span class="nam">raw_output</span><span class="op">=</span><span class="nam">response</span> </span><span class="r"></span></p>
|
|
237
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t153" href="#t153">153</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
238
|
+
<p class="pln"><span class="n"><a id="t154" href="#t154">154</a></span><span class="t"> </span><span class="r"></span></p>
|
|
239
|
+
<p class="mis show_mis"><span class="n"><a id="t155" href="#t155">155</a></span><span class="t"> <span class="nam">is_correct</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_check_correctness</span><span class="op">(</span><span class="nam">response</span><span class="op">,</span> <span class="nam">task</span><span class="op">.</span><span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
240
|
+
<p class="pln"><span class="n"><a id="t156" href="#t156">156</a></span><span class="t"> </span><span class="r"></span></p>
|
|
241
|
+
<p class="mis show_mis"><span class="n"><a id="t157" href="#t157">157</a></span><span class="t"> <span class="nam">results</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">VerificationResult</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
242
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t158" href="#t158">158</a></span><span class="t"> <span class="nam">task_id</span><span class="op">=</span><span class="nam">task</span><span class="op">.</span><span class="nam">id</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
243
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t159" href="#t159">159</a></span><span class="t"> <span class="nam">is_correct</span><span class="op">=</span><span class="nam">is_correct</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
244
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t160" href="#t160">160</a></span><span class="t"> <span class="nam">confidence</span><span class="op">=</span><span class="num">1.0</span> <span class="key">if</span> <span class="nam">is_correct</span> <span class="key">else</span> <span class="num">0.0</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
245
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t161" href="#t161">161</a></span><span class="t"> <span class="nam">responses</span><span class="op">=</span><span class="op">[</span><span class="nam">model_response</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
246
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t162" href="#t162">162</a></span><span class="t"> <span class="nam">consensus_method</span><span class="op">=</span><span class="str">"single_model"</span> </span><span class="r"></span></p>
|
|
247
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t163" href="#t163">163</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
248
|
+
<p class="pln"><span class="n"><a id="t164" href="#t164">164</a></span><span class="t"> </span><span class="r"></span></p>
|
|
249
|
+
<p class="mis show_mis"><span class="n"><a id="t165" href="#t165">165</a></span><span class="t"> <span class="key">return</span> <span class="nam">results</span> </span><span class="r"></span></p>
|
|
250
|
+
<p class="pln"><span class="n"><a id="t166" href="#t166">166</a></span><span class="t"> </span><span class="r"></span></p>
|
|
251
|
+
<p class="mis show_mis"><span class="n"><a id="t167" href="#t167">167</a></span><span class="t"> <span class="key">def</span> <span class="nam">run_multi_model</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">tasks</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">BenchmarkTask</span><span class="op">]</span><span class="op">)</span> <span class="op">-></span> <span class="nam">list</span><span class="op">[</span><span class="nam">VerificationResult</span><span class="op">]</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
252
|
+
<p class="pln"><span class="n"><a id="t168" href="#t168">168</a></span><span class="t"> <span class="str">"""Run benchmark with multiple models using CMVK consensus."""</span> </span><span class="r"></span></p>
|
|
253
|
+
<p class="mis show_mis"><span class="n"><a id="t169" href="#t169">169</a></span><span class="t"> <span class="nam">results</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
254
|
+
<p class="pln"><span class="n"><a id="t170" href="#t170">170</a></span><span class="t"> </span><span class="r"></span></p>
|
|
255
|
+
<p class="mis show_mis"><span class="n"><a id="t171" href="#t171">171</a></span><span class="t"> <span class="key">for</span> <span class="nam">task</span> <span class="key">in</span> <span class="nam">tasks</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
256
|
+
<p class="mis show_mis"><span class="n"><a id="t172" href="#t172">172</a></span><span class="t"> <span class="nam">responses</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
257
|
+
<p class="pln"><span class="n"><a id="t173" href="#t173">173</a></span><span class="t"> </span><span class="r"></span></p>
|
|
258
|
+
<p class="pln"><span class="n"><a id="t174" href="#t174">174</a></span><span class="t"> <span class="com"># Get response from each model</span> </span><span class="r"></span></p>
|
|
259
|
+
<p class="mis show_mis"><span class="n"><a id="t175" href="#t175">175</a></span><span class="t"> <span class="key">for</span> <span class="nam">model</span> <span class="key">in</span> <span class="nam">self</span><span class="op">.</span><span class="nam">models</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
260
|
+
<p class="mis show_mis"><span class="n"><a id="t176" href="#t176">176</a></span><span class="t"> <span class="nam">start</span> <span class="op">=</span> <span class="nam">time</span><span class="op">.</span><span class="nam">perf_counter</span><span class="op">(</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
261
|
+
<p class="mis show_mis"><span class="n"><a id="t177" href="#t177">177</a></span><span class="t"> <span class="nam">response</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">model_fn</span><span class="op">(</span><span class="nam">model</span><span class="op">,</span> <span class="nam">task</span><span class="op">.</span><span class="nam">prompt</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
262
|
+
<p class="mis show_mis"><span class="n"><a id="t178" href="#t178">178</a></span><span class="t"> <span class="nam">latency</span> <span class="op">=</span> <span class="op">(</span><span class="nam">time</span><span class="op">.</span><span class="nam">perf_counter</span><span class="op">(</span><span class="op">)</span> <span class="op">-</span> <span class="nam">start</span><span class="op">)</span> <span class="op">*</span> <span class="num">1000</span> </span><span class="r"></span></p>
|
|
263
|
+
<p class="pln"><span class="n"><a id="t179" href="#t179">179</a></span><span class="t"> </span><span class="r"></span></p>
|
|
264
|
+
<p class="mis show_mis"><span class="n"><a id="t180" href="#t180">180</a></span><span class="t"> <span class="nam">responses</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">ModelResponse</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
265
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t181" href="#t181">181</a></span><span class="t"> <span class="nam">model_name</span><span class="op">=</span><span class="nam">model</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
266
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t182" href="#t182">182</a></span><span class="t"> <span class="nam">response</span><span class="op">=</span><span class="nam">response</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
267
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t183" href="#t183">183</a></span><span class="t"> <span class="nam">latency_ms</span><span class="op">=</span><span class="nam">latency</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
268
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t184" href="#t184">184</a></span><span class="t"> <span class="nam">raw_output</span><span class="op">=</span><span class="nam">response</span> </span><span class="r"></span></p>
|
|
269
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t185" href="#t185">185</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
270
|
+
<p class="pln"><span class="n"><a id="t186" href="#t186">186</a></span><span class="t"> </span><span class="r"></span></p>
|
|
271
|
+
<p class="pln"><span class="n"><a id="t187" href="#t187">187</a></span><span class="t"> <span class="com"># Apply consensus method</span> </span><span class="r"></span></p>
|
|
272
|
+
<p class="mis show_mis"><span class="n"><a id="t188" href="#t188">188</a></span><span class="t"> <span class="nam">consensus_result</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_apply_consensus</span><span class="op">(</span><span class="nam">responses</span><span class="op">,</span> <span class="nam">task</span><span class="op">.</span><span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
273
|
+
<p class="pln"><span class="n"><a id="t189" href="#t189">189</a></span><span class="t"> </span><span class="r"></span></p>
|
|
274
|
+
<p class="mis show_mis"><span class="n"><a id="t190" href="#t190">190</a></span><span class="t"> <span class="nam">results</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">VerificationResult</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
275
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t191" href="#t191">191</a></span><span class="t"> <span class="nam">task_id</span><span class="op">=</span><span class="nam">task</span><span class="op">.</span><span class="nam">id</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
276
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t192" href="#t192">192</a></span><span class="t"> <span class="nam">is_correct</span><span class="op">=</span><span class="nam">consensus_result</span><span class="op">[</span><span class="str">"is_correct"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
277
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t193" href="#t193">193</a></span><span class="t"> <span class="nam">confidence</span><span class="op">=</span><span class="nam">consensus_result</span><span class="op">[</span><span class="str">"confidence"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
278
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t194" href="#t194">194</a></span><span class="t"> <span class="nam">responses</span><span class="op">=</span><span class="nam">responses</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
279
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t195" href="#t195">195</a></span><span class="t"> <span class="nam">consensus_method</span><span class="op">=</span><span class="nam">self</span><span class="op">.</span><span class="nam">consensus_method</span><span class="op">.</span><span class="nam">value</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
280
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t196" href="#t196">196</a></span><span class="t"> <span class="nam">drift_score</span><span class="op">=</span><span class="nam">consensus_result</span><span class="op">.</span><span class="nam">get</span><span class="op">(</span><span class="str">"drift_score"</span><span class="op">,</span> <span class="num">0.0</span><span class="op">)</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
281
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t197" href="#t197">197</a></span><span class="t"> <span class="nam">details</span><span class="op">=</span><span class="nam">consensus_result</span><span class="op">.</span><span class="nam">get</span><span class="op">(</span><span class="str">"details"</span><span class="op">,</span> <span class="op">{</span><span class="op">}</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
282
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t198" href="#t198">198</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
283
|
+
<p class="pln"><span class="n"><a id="t199" href="#t199">199</a></span><span class="t"> </span><span class="r"></span></p>
|
|
284
|
+
<p class="mis show_mis"><span class="n"><a id="t200" href="#t200">200</a></span><span class="t"> <span class="key">return</span> <span class="nam">results</span> </span><span class="r"></span></p>
|
|
285
|
+
<p class="pln"><span class="n"><a id="t201" href="#t201">201</a></span><span class="t"> </span><span class="r"></span></p>
|
|
286
|
+
<p class="mis show_mis"><span class="n"><a id="t202" href="#t202">202</a></span><span class="t"> <span class="key">def</span> <span class="nam">_apply_consensus</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">dict</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
287
|
+
<p class="pln"><span class="n"><a id="t203" href="#t203">203</a></span><span class="t"> <span class="str">"""</span> </span><span class="r"></span></p>
|
|
288
|
+
<p class="pln"><span class="n"><a id="t204" href="#t204">204</a></span><span class="t"><span class="str"> Apply consensus method to multiple model responses.</span> </span><span class="r"></span></p>
|
|
289
|
+
<p class="pln"><span class="n"><a id="t205" href="#t205">205</a></span><span class="t"><span class="str"> </span> </span><span class="r"></span></p>
|
|
290
|
+
<p class="pln"><span class="n"><a id="t206" href="#t206">206</a></span><span class="t"><span class="str"> This is where the CMVK algorithm is applied.</span> </span><span class="r"></span></p>
|
|
291
|
+
<p class="pln"><span class="n"><a id="t207" href="#t207">207</a></span><span class="t"><span class="str"> """</span> </span><span class="r"></span></p>
|
|
292
|
+
<p class="mis show_mis"><span class="n"><a id="t208" href="#t208">208</a></span><span class="t"> <span class="key">if</span> <span class="nam">self</span><span class="op">.</span><span class="nam">consensus_method</span> <span class="op">==</span> <span class="nam">ConsensusMethod</span><span class="op">.</span><span class="nam">MAJORITY_VOTE</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
293
|
+
<p class="mis show_mis"><span class="n"><a id="t209" href="#t209">209</a></span><span class="t"> <span class="key">return</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_majority_vote_consensus</span><span class="op">(</span><span class="nam">responses</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
294
|
+
<p class="mis show_mis"><span class="n"><a id="t210" href="#t210">210</a></span><span class="t"> <span class="key">elif</span> <span class="nam">self</span><span class="op">.</span><span class="nam">consensus_method</span> <span class="op">==</span> <span class="nam">ConsensusMethod</span><span class="op">.</span><span class="nam">UNANIMOUS</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
295
|
+
<p class="mis show_mis"><span class="n"><a id="t211" href="#t211">211</a></span><span class="t"> <span class="key">return</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_unanimous_consensus</span><span class="op">(</span><span class="nam">responses</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
296
|
+
<p class="mis show_mis"><span class="n"><a id="t212" href="#t212">212</a></span><span class="t"> <span class="key">elif</span> <span class="nam">self</span><span class="op">.</span><span class="nam">consensus_method</span> <span class="op">==</span> <span class="nam">ConsensusMethod</span><span class="op">.</span><span class="nam">DRIFT_THRESHOLD</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
297
|
+
<p class="mis show_mis"><span class="n"><a id="t213" href="#t213">213</a></span><span class="t"> <span class="key">return</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_drift_consensus</span><span class="op">(</span><span class="nam">responses</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
298
|
+
<p class="pln"><span class="n"><a id="t214" href="#t214">214</a></span><span class="t"> <span class="key">else</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
299
|
+
<p class="mis show_mis"><span class="n"><a id="t215" href="#t215">215</a></span><span class="t"> <span class="key">return</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_majority_vote_consensus</span><span class="op">(</span><span class="nam">responses</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
300
|
+
<p class="pln"><span class="n"><a id="t216" href="#t216">216</a></span><span class="t"> </span><span class="r"></span></p>
|
|
301
|
+
<p class="mis show_mis"><span class="n"><a id="t217" href="#t217">217</a></span><span class="t"> <span class="key">def</span> <span class="nam">_majority_vote_consensus</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">dict</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
302
|
+
<p class="pln"><span class="n"><a id="t218" href="#t218">218</a></span><span class="t"> <span class="str">"""Simple majority voting."""</span> </span><span class="r"></span></p>
|
|
303
|
+
<p class="mis show_mis"><span class="n"><a id="t219" href="#t219">219</a></span><span class="t"> <span class="nam">correct_count</span> <span class="op">=</span> <span class="nam">sum</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
304
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t220" href="#t220">220</a></span><span class="t"> <span class="num">1</span> <span class="key">for</span> <span class="nam">r</span> <span class="key">in</span> <span class="nam">responses</span> </span><span class="r"></span></p>
|
|
305
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t221" href="#t221">221</a></span><span class="t"> <span class="key">if</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_check_correctness</span><span class="op">(</span><span class="nam">r</span><span class="op">.</span><span class="nam">response</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
306
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t222" href="#t222">222</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
307
|
+
<p class="pln"><span class="n"><a id="t223" href="#t223">223</a></span><span class="t"> </span><span class="r"></span></p>
|
|
308
|
+
<p class="mis show_mis"><span class="n"><a id="t224" href="#t224">224</a></span><span class="t"> <span class="nam">majority</span> <span class="op">=</span> <span class="nam">correct_count</span> <span class="op">></span> <span class="nam">len</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span> <span class="op">/</span> <span class="num">2</span> </span><span class="r"></span></p>
|
|
309
|
+
<p class="mis show_mis"><span class="n"><a id="t225" href="#t225">225</a></span><span class="t"> <span class="nam">confidence</span> <span class="op">=</span> <span class="nam">correct_count</span> <span class="op">/</span> <span class="nam">len</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
310
|
+
<p class="pln"><span class="n"><a id="t226" href="#t226">226</a></span><span class="t"> </span><span class="r"></span></p>
|
|
311
|
+
<p class="mis show_mis"><span class="n"><a id="t227" href="#t227">227</a></span><span class="t"> <span class="key">return</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
312
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t228" href="#t228">228</a></span><span class="t"> <span class="str">"is_correct"</span><span class="op">:</span> <span class="nam">majority</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
313
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t229" href="#t229">229</a></span><span class="t"> <span class="str">"confidence"</span><span class="op">:</span> <span class="nam">confidence</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
314
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t230" href="#t230">230</a></span><span class="t"> <span class="str">"details"</span><span class="op">:</span> <span class="op">{</span><span class="str">"correct_count"</span><span class="op">:</span> <span class="nam">correct_count</span><span class="op">,</span> <span class="str">"total"</span><span class="op">:</span> <span class="nam">len</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span><span class="op">}</span> </span><span class="r"></span></p>
|
|
315
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t231" href="#t231">231</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
316
|
+
<p class="pln"><span class="n"><a id="t232" href="#t232">232</a></span><span class="t"> </span><span class="r"></span></p>
|
|
317
|
+
<p class="mis show_mis"><span class="n"><a id="t233" href="#t233">233</a></span><span class="t"> <span class="key">def</span> <span class="nam">_unanimous_consensus</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">dict</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
318
|
+
<p class="pln"><span class="n"><a id="t234" href="#t234">234</a></span><span class="t"> <span class="str">"""Require all models to agree."""</span> </span><span class="r"></span></p>
|
|
319
|
+
<p class="mis show_mis"><span class="n"><a id="t235" href="#t235">235</a></span><span class="t"> <span class="nam">all_correct</span> <span class="op">=</span> <span class="nam">all</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
320
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t236" href="#t236">236</a></span><span class="t"> <span class="nam">self</span><span class="op">.</span><span class="nam">_check_correctness</span><span class="op">(</span><span class="nam">r</span><span class="op">.</span><span class="nam">response</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
321
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t237" href="#t237">237</a></span><span class="t"> <span class="key">for</span> <span class="nam">r</span> <span class="key">in</span> <span class="nam">responses</span> </span><span class="r"></span></p>
|
|
322
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t238" href="#t238">238</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
323
|
+
<p class="pln"><span class="n"><a id="t239" href="#t239">239</a></span><span class="t"> </span><span class="r"></span></p>
|
|
324
|
+
<p class="mis show_mis"><span class="n"><a id="t240" href="#t240">240</a></span><span class="t"> <span class="key">return</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
325
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t241" href="#t241">241</a></span><span class="t"> <span class="str">"is_correct"</span><span class="op">:</span> <span class="nam">all_correct</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
326
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t242" href="#t242">242</a></span><span class="t"> <span class="str">"confidence"</span><span class="op">:</span> <span class="num">1.0</span> <span class="key">if</span> <span class="nam">all_correct</span> <span class="key">else</span> <span class="num">0.0</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
327
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t243" href="#t243">243</a></span><span class="t"> <span class="str">"details"</span><span class="op">:</span> <span class="op">{</span><span class="str">"unanimous"</span><span class="op">:</span> <span class="nam">all_correct</span><span class="op">}</span> </span><span class="r"></span></p>
|
|
328
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t244" href="#t244">244</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
329
|
+
<p class="pln"><span class="n"><a id="t245" href="#t245">245</a></span><span class="t"> </span><span class="r"></span></p>
|
|
330
|
+
<p class="mis show_mis"><span class="n"><a id="t246" href="#t246">246</a></span><span class="t"> <span class="key">def</span> <span class="nam">_drift_consensus</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">dict</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
331
|
+
<p class="pln"><span class="n"><a id="t247" href="#t247">247</a></span><span class="t"> <span class="str">"""</span> </span><span class="r"></span></p>
|
|
332
|
+
<p class="pln"><span class="n"><a id="t248" href="#t248">248</a></span><span class="t"><span class="str"> CMVK drift-based consensus.</span> </span><span class="r"></span></p>
|
|
333
|
+
<p class="pln"><span class="n"><a id="t249" href="#t249">249</a></span><span class="t"><span class="str"> </span> </span><span class="r"></span></p>
|
|
334
|
+
<p class="pln"><span class="n"><a id="t250" href="#t250">250</a></span><span class="t"><span class="str"> Algorithm:</span> </span><span class="r"></span></p>
|
|
335
|
+
<p class="pln"><span class="n"><a id="t251" href="#t251">251</a></span><span class="t"><span class="str"> 1. Convert responses to vectors (embeddings or numeric)</span> </span><span class="r"></span></p>
|
|
336
|
+
<p class="pln"><span class="n"><a id="t252" href="#t252">252</a></span><span class="t"><span class="str"> 2. Calculate pairwise drift between all responses</span> </span><span class="r"></span></p>
|
|
337
|
+
<p class="pln"><span class="n"><a id="t253" href="#t253">253</a></span><span class="t"><span class="str"> 3. If max drift > threshold, responses disagree → flag for review</span> </span><span class="r"></span></p>
|
|
338
|
+
<p class="pln"><span class="n"><a id="t254" href="#t254">254</a></span><span class="t"><span class="str"> 4. If responses agree, check against ground truth</span> </span><span class="r"></span></p>
|
|
339
|
+
<p class="pln"><span class="n"><a id="t255" href="#t255">255</a></span><span class="t"><span class="str"> """</span> </span><span class="r"></span></p>
|
|
340
|
+
<p class="pln"><span class="n"><a id="t256" href="#t256">256</a></span><span class="t"> <span class="com"># Calculate drift between responses</span> </span><span class="r"></span></p>
|
|
341
|
+
<p class="mis show_mis"><span class="n"><a id="t257" href="#t257">257</a></span><span class="t"> <span class="nam">drift_scores</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_calculate_pairwise_drift</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
342
|
+
<p class="mis show_mis"><span class="n"><a id="t258" href="#t258">258</a></span><span class="t"> <span class="nam">max_drift</span> <span class="op">=</span> <span class="nam">max</span><span class="op">(</span><span class="nam">drift_scores</span><span class="op">)</span> <span class="key">if</span> <span class="nam">drift_scores</span> <span class="key">else</span> <span class="num">0.0</span> </span><span class="r"></span></p>
|
|
343
|
+
<p class="mis show_mis"><span class="n"><a id="t259" href="#t259">259</a></span><span class="t"> <span class="nam">avg_drift</span> <span class="op">=</span> <span class="nam">np</span><span class="op">.</span><span class="nam">mean</span><span class="op">(</span><span class="nam">drift_scores</span><span class="op">)</span> <span class="key">if</span> <span class="nam">drift_scores</span> <span class="key">else</span> <span class="num">0.0</span> </span><span class="r"></span></p>
|
|
344
|
+
<p class="pln"><span class="n"><a id="t260" href="#t260">260</a></span><span class="t"> </span><span class="r"></span></p>
|
|
345
|
+
<p class="pln"><span class="n"><a id="t261" href="#t261">261</a></span><span class="t"> <span class="com"># High drift = disagreement = low confidence</span> </span><span class="r"></span></p>
|
|
346
|
+
<p class="mis show_mis"><span class="n"><a id="t262" href="#t262">262</a></span><span class="t"> <span class="key">if</span> <span class="nam">max_drift</span> <span class="op">></span> <span class="nam">self</span><span class="op">.</span><span class="nam">drift_threshold</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
347
|
+
<p class="pln"><span class="n"><a id="t263" href="#t263">263</a></span><span class="t"> <span class="com"># Models disagree significantly</span> </span><span class="r"></span></p>
|
|
348
|
+
<p class="mis show_mis"><span class="n"><a id="t264" href="#t264">264</a></span><span class="t"> <span class="key">return</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
349
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t265" href="#t265">265</a></span><span class="t"> <span class="str">"is_correct"</span><span class="op">:</span> <span class="key">False</span><span class="op">,</span> <span class="com"># Can't trust when models disagree</span> </span><span class="r"></span></p>
|
|
350
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t266" href="#t266">266</a></span><span class="t"> <span class="str">"confidence"</span><span class="op">:</span> <span class="num">1.0</span> <span class="op">-</span> <span class="nam">max_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
351
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t267" href="#t267">267</a></span><span class="t"> <span class="str">"drift_score"</span><span class="op">:</span> <span class="nam">max_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
352
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t268" href="#t268">268</a></span><span class="t"> <span class="str">"details"</span><span class="op">:</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
353
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t269" href="#t269">269</a></span><span class="t"> <span class="str">"disagreement_detected"</span><span class="op">:</span> <span class="key">True</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
354
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t270" href="#t270">270</a></span><span class="t"> <span class="str">"max_drift"</span><span class="op">:</span> <span class="nam">max_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
355
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t271" href="#t271">271</a></span><span class="t"> <span class="str">"avg_drift"</span><span class="op">:</span> <span class="nam">avg_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
356
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t272" href="#t272">272</a></span><span class="t"> <span class="str">"threshold"</span><span class="op">:</span> <span class="nam">self</span><span class="op">.</span><span class="nam">drift_threshold</span> </span><span class="r"></span></p>
|
|
357
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t273" href="#t273">273</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
358
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t274" href="#t274">274</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
359
|
+
<p class="pln"><span class="n"><a id="t275" href="#t275">275</a></span><span class="t"> </span><span class="r"></span></p>
|
|
360
|
+
<p class="pln"><span class="n"><a id="t276" href="#t276">276</a></span><span class="t"> <span class="com"># Models agree - check correctness</span> </span><span class="r"></span></p>
|
|
361
|
+
<p class="pln"><span class="n"><a id="t277" href="#t277">277</a></span><span class="t"> <span class="com"># Use first response as representative (they all agree)</span> </span><span class="r"></span></p>
|
|
362
|
+
<p class="mis show_mis"><span class="n"><a id="t278" href="#t278">278</a></span><span class="t"> <span class="nam">is_correct</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_check_correctness</span><span class="op">(</span><span class="nam">responses</span><span class="op">[</span><span class="num">0</span><span class="op">]</span><span class="op">.</span><span class="nam">response</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
363
|
+
<p class="pln"><span class="n"><a id="t279" href="#t279">279</a></span><span class="t"> </span><span class="r"></span></p>
|
|
364
|
+
<p class="mis show_mis"><span class="n"><a id="t280" href="#t280">280</a></span><span class="t"> <span class="key">return</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
365
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t281" href="#t281">281</a></span><span class="t"> <span class="str">"is_correct"</span><span class="op">:</span> <span class="nam">is_correct</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
366
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t282" href="#t282">282</a></span><span class="t"> <span class="str">"confidence"</span><span class="op">:</span> <span class="num">1.0</span> <span class="op">-</span> <span class="nam">avg_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
367
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t283" href="#t283">283</a></span><span class="t"> <span class="str">"drift_score"</span><span class="op">:</span> <span class="nam">avg_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
368
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t284" href="#t284">284</a></span><span class="t"> <span class="str">"details"</span><span class="op">:</span> <span class="op">{</span> </span><span class="r"></span></p>
|
|
369
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t285" href="#t285">285</a></span><span class="t"> <span class="str">"disagreement_detected"</span><span class="op">:</span> <span class="key">False</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
370
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t286" href="#t286">286</a></span><span class="t"> <span class="str">"max_drift"</span><span class="op">:</span> <span class="nam">max_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
371
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t287" href="#t287">287</a></span><span class="t"> <span class="str">"avg_drift"</span><span class="op">:</span> <span class="nam">avg_drift</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
372
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t288" href="#t288">288</a></span><span class="t"> <span class="str">"consensus_response"</span><span class="op">:</span> <span class="nam">responses</span><span class="op">[</span><span class="num">0</span><span class="op">]</span><span class="op">.</span><span class="nam">response</span> </span><span class="r"></span></p>
|
|
373
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t289" href="#t289">289</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
374
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t290" href="#t290">290</a></span><span class="t"> <span class="op">}</span> </span><span class="r"></span></p>
|
|
375
|
+
<p class="pln"><span class="n"><a id="t291" href="#t291">291</a></span><span class="t"> </span><span class="r"></span></p>
|
|
376
|
+
<p class="mis show_mis"><span class="n"><a id="t292" href="#t292">292</a></span><span class="t"> <span class="key">def</span> <span class="nam">_calculate_pairwise_drift</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">responses</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">ModelResponse</span><span class="op">]</span><span class="op">)</span> <span class="op">-></span> <span class="nam">list</span><span class="op">[</span><span class="nam">float</span><span class="op">]</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
377
|
+
<p class="pln"><span class="n"><a id="t293" href="#t293">293</a></span><span class="t"> <span class="str">"""Calculate drift between all pairs of responses."""</span> </span><span class="r"></span></p>
|
|
378
|
+
<p class="mis show_mis"><span class="n"><a id="t294" href="#t294">294</a></span><span class="t"> <span class="nam">drifts</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
379
|
+
<p class="pln"><span class="n"><a id="t295" href="#t295">295</a></span><span class="t"> </span><span class="r"></span></p>
|
|
380
|
+
<p class="mis show_mis"><span class="n"><a id="t296" href="#t296">296</a></span><span class="t"> <span class="key">for</span> <span class="nam">i</span> <span class="key">in</span> <span class="nam">range</span><span class="op">(</span><span class="nam">len</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
381
|
+
<p class="mis show_mis"><span class="n"><a id="t297" href="#t297">297</a></span><span class="t"> <span class="key">for</span> <span class="nam">j</span> <span class="key">in</span> <span class="nam">range</span><span class="op">(</span><span class="nam">i</span> <span class="op">+</span> <span class="num">1</span><span class="op">,</span> <span class="nam">len</span><span class="op">(</span><span class="nam">responses</span><span class="op">)</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
382
|
+
<p class="mis show_mis"><span class="n"><a id="t298" href="#t298">298</a></span><span class="t"> <span class="nam">drift</span> <span class="op">=</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_response_drift</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
383
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t299" href="#t299">299</a></span><span class="t"> <span class="nam">responses</span><span class="op">[</span><span class="nam">i</span><span class="op">]</span><span class="op">.</span><span class="nam">response</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
384
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t300" href="#t300">300</a></span><span class="t"> <span class="nam">responses</span><span class="op">[</span><span class="nam">j</span><span class="op">]</span><span class="op">.</span><span class="nam">response</span> </span><span class="r"></span></p>
|
|
385
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t301" href="#t301">301</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
386
|
+
<p class="mis show_mis"><span class="n"><a id="t302" href="#t302">302</a></span><span class="t"> <span class="nam">drifts</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">drift</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
387
|
+
<p class="pln"><span class="n"><a id="t303" href="#t303">303</a></span><span class="t"> </span><span class="r"></span></p>
|
|
388
|
+
<p class="mis show_mis"><span class="n"><a id="t304" href="#t304">304</a></span><span class="t"> <span class="key">return</span> <span class="nam">drifts</span> </span><span class="r"></span></p>
|
|
389
|
+
<p class="pln"><span class="n"><a id="t305" href="#t305">305</a></span><span class="t"> </span><span class="r"></span></p>
|
|
390
|
+
<p class="mis show_mis"><span class="n"><a id="t306" href="#t306">306</a></span><span class="t"> <span class="key">def</span> <span class="nam">_response_drift</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">response_a</span><span class="op">:</span> <span class="nam">Any</span><span class="op">,</span> <span class="nam">response_b</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">float</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
391
|
+
<p class="pln"><span class="n"><a id="t307" href="#t307">307</a></span><span class="t"> <span class="str">"""</span> </span><span class="r"></span></p>
|
|
392
|
+
<p class="pln"><span class="n"><a id="t308" href="#t308">308</a></span><span class="t"><span class="str"> Calculate drift between two responses.</span> </span><span class="r"></span></p>
|
|
393
|
+
<p class="pln"><span class="n"><a id="t309" href="#t309">309</a></span><span class="t"><span class="str"> </span> </span><span class="r"></span></p>
|
|
394
|
+
<p class="pln"><span class="n"><a id="t310" href="#t310">310</a></span><span class="t"><span class="str"> For numeric responses: normalized absolute difference</span> </span><span class="r"></span></p>
|
|
395
|
+
<p class="pln"><span class="n"><a id="t311" href="#t311">311</a></span><span class="t"><span class="str"> For string responses: Levenshtein-based similarity</span> </span><span class="r"></span></p>
|
|
396
|
+
<p class="pln"><span class="n"><a id="t312" href="#t312">312</a></span><span class="t"><span class="str"> For structured: recursive comparison</span> </span><span class="r"></span></p>
|
|
397
|
+
<p class="pln"><span class="n"><a id="t313" href="#t313">313</a></span><span class="t"><span class="str"> """</span> </span><span class="r"></span></p>
|
|
398
|
+
<p class="pln"><span class="n"><a id="t314" href="#t314">314</a></span><span class="t"> <span class="com"># Handle numeric</span> </span><span class="r"></span></p>
|
|
399
|
+
<p class="mis show_mis"><span class="n"><a id="t315" href="#t315">315</a></span><span class="t"> <span class="key">if</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_a</span><span class="op">,</span> <span class="op">(</span><span class="nam">int</span><span class="op">,</span> <span class="nam">float</span><span class="op">)</span><span class="op">)</span> <span class="key">and</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_b</span><span class="op">,</span> <span class="op">(</span><span class="nam">int</span><span class="op">,</span> <span class="nam">float</span><span class="op">)</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
400
|
+
<p class="mis show_mis"><span class="n"><a id="t316" href="#t316">316</a></span><span class="t"> <span class="nam">max_val</span> <span class="op">=</span> <span class="nam">max</span><span class="op">(</span><span class="nam">abs</span><span class="op">(</span><span class="nam">response_a</span><span class="op">)</span><span class="op">,</span> <span class="nam">abs</span><span class="op">(</span><span class="nam">response_b</span><span class="op">)</span><span class="op">,</span> <span class="num">1</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
401
|
+
<p class="mis show_mis"><span class="n"><a id="t317" href="#t317">317</a></span><span class="t"> <span class="key">return</span> <span class="nam">abs</span><span class="op">(</span><span class="nam">response_a</span> <span class="op">-</span> <span class="nam">response_b</span><span class="op">)</span> <span class="op">/</span> <span class="nam">max_val</span> </span><span class="r"></span></p>
|
|
402
|
+
<p class="pln"><span class="n"><a id="t318" href="#t318">318</a></span><span class="t"> </span><span class="r"></span></p>
|
|
403
|
+
<p class="pln"><span class="n"><a id="t319" href="#t319">319</a></span><span class="t"> <span class="com"># Handle string</span> </span><span class="r"></span></p>
|
|
404
|
+
<p class="mis show_mis"><span class="n"><a id="t320" href="#t320">320</a></span><span class="t"> <span class="key">if</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_a</span><span class="op">,</span> <span class="nam">str</span><span class="op">)</span> <span class="key">and</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_b</span><span class="op">,</span> <span class="nam">str</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
405
|
+
<p class="mis show_mis"><span class="n"><a id="t321" href="#t321">321</a></span><span class="t"> <span class="key">return</span> <span class="nam">self</span><span class="op">.</span><span class="nam">_string_drift</span><span class="op">(</span><span class="nam">response_a</span><span class="op">,</span> <span class="nam">response_b</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
406
|
+
<p class="pln"><span class="n"><a id="t322" href="#t322">322</a></span><span class="t"> </span><span class="r"></span></p>
|
|
407
|
+
<p class="pln"><span class="n"><a id="t323" href="#t323">323</a></span><span class="t"> <span class="com"># Handle lists/arrays</span> </span><span class="r"></span></p>
|
|
408
|
+
<p class="mis show_mis"><span class="n"><a id="t324" href="#t324">324</a></span><span class="t"> <span class="key">if</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_a</span><span class="op">,</span> <span class="op">(</span><span class="nam">list</span><span class="op">,</span> <span class="nam">np</span><span class="op">.</span><span class="nam">ndarray</span><span class="op">)</span><span class="op">)</span> <span class="key">and</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response_b</span><span class="op">,</span> <span class="op">(</span><span class="nam">list</span><span class="op">,</span> <span class="nam">np</span><span class="op">.</span><span class="nam">ndarray</span><span class="op">)</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
409
|
+
<p class="mis show_mis"><span class="n"><a id="t325" href="#t325">325</a></span><span class="t"> <span class="nam">a</span> <span class="op">=</span> <span class="nam">np</span><span class="op">.</span><span class="nam">array</span><span class="op">(</span><span class="nam">response_a</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
410
|
+
<p class="mis show_mis"><span class="n"><a id="t326" href="#t326">326</a></span><span class="t"> <span class="nam">b</span> <span class="op">=</span> <span class="nam">np</span><span class="op">.</span><span class="nam">array</span><span class="op">(</span><span class="nam">response_b</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
411
|
+
<p class="mis show_mis"><span class="n"><a id="t327" href="#t327">327</a></span><span class="t"> <span class="key">if</span> <span class="nam">a</span><span class="op">.</span><span class="nam">shape</span> <span class="op">==</span> <span class="nam">b</span><span class="op">.</span><span class="nam">shape</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
412
|
+
<p class="mis show_mis"><span class="n"><a id="t328" href="#t328">328</a></span><span class="t"> <span class="key">return</span> <span class="nam">float</span><span class="op">(</span><span class="nam">np</span><span class="op">.</span><span class="nam">linalg</span><span class="op">.</span><span class="nam">norm</span><span class="op">(</span><span class="nam">a</span> <span class="op">-</span> <span class="nam">b</span><span class="op">)</span> <span class="op">/</span> <span class="op">(</span><span class="nam">np</span><span class="op">.</span><span class="nam">linalg</span><span class="op">.</span><span class="nam">norm</span><span class="op">(</span><span class="nam">a</span><span class="op">)</span> <span class="op">+</span> <span class="nam">np</span><span class="op">.</span><span class="nam">linalg</span><span class="op">.</span><span class="nam">norm</span><span class="op">(</span><span class="nam">b</span><span class="op">)</span> <span class="op">+</span> <span class="num">1e-10</span><span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
413
|
+
<p class="pln"><span class="n"><a id="t329" href="#t329">329</a></span><span class="t"> </span><span class="r"></span></p>
|
|
414
|
+
<p class="pln"><span class="n"><a id="t330" href="#t330">330</a></span><span class="t"> <span class="com"># Fallback: exact match</span> </span><span class="r"></span></p>
|
|
415
|
+
<p class="mis show_mis"><span class="n"><a id="t331" href="#t331">331</a></span><span class="t"> <span class="key">return</span> <span class="num">0.0</span> <span class="key">if</span> <span class="nam">response_a</span> <span class="op">==</span> <span class="nam">response_b</span> <span class="key">else</span> <span class="num">1.0</span> </span><span class="r"></span></p>
|
|
416
|
+
<p class="pln"><span class="n"><a id="t332" href="#t332">332</a></span><span class="t"> </span><span class="r"></span></p>
|
|
417
|
+
<p class="mis show_mis"><span class="n"><a id="t333" href="#t333">333</a></span><span class="t"> <span class="key">def</span> <span class="nam">_string_drift</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">a</span><span class="op">:</span> <span class="nam">str</span><span class="op">,</span> <span class="nam">b</span><span class="op">:</span> <span class="nam">str</span><span class="op">)</span> <span class="op">-></span> <span class="nam">float</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
418
|
+
<p class="pln"><span class="n"><a id="t334" href="#t334">334</a></span><span class="t"> <span class="str">"""Calculate drift between strings using character-level comparison."""</span> </span><span class="r"></span></p>
|
|
419
|
+
<p class="mis show_mis"><span class="n"><a id="t335" href="#t335">335</a></span><span class="t"> <span class="key">if</span> <span class="nam">a</span> <span class="op">==</span> <span class="nam">b</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
420
|
+
<p class="mis show_mis"><span class="n"><a id="t336" href="#t336">336</a></span><span class="t"> <span class="key">return</span> <span class="num">0.0</span> </span><span class="r"></span></p>
|
|
421
|
+
<p class="mis show_mis"><span class="n"><a id="t337" href="#t337">337</a></span><span class="t"> <span class="key">if</span> <span class="key">not</span> <span class="nam">a</span> <span class="key">or</span> <span class="key">not</span> <span class="nam">b</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
422
|
+
<p class="mis show_mis"><span class="n"><a id="t338" href="#t338">338</a></span><span class="t"> <span class="key">return</span> <span class="num">1.0</span> </span><span class="r"></span></p>
|
|
423
|
+
<p class="pln"><span class="n"><a id="t339" href="#t339">339</a></span><span class="t"> </span><span class="r"></span></p>
|
|
424
|
+
<p class="pln"><span class="n"><a id="t340" href="#t340">340</a></span><span class="t"> <span class="com"># Simple normalized edit distance approximation</span> </span><span class="r"></span></p>
|
|
425
|
+
<p class="pln"><span class="n"><a id="t341" href="#t341">341</a></span><span class="t"> <span class="com"># (For production, use proper Levenshtein or embedding similarity)</span> </span><span class="r"></span></p>
|
|
426
|
+
<p class="mis show_mis"><span class="n"><a id="t342" href="#t342">342</a></span><span class="t"> <span class="nam">common</span> <span class="op">=</span> <span class="nam">sum</span><span class="op">(</span><span class="num">1</span> <span class="key">for</span> <span class="nam">c</span> <span class="key">in</span> <span class="nam">a</span> <span class="key">if</span> <span class="nam">c</span> <span class="key">in</span> <span class="nam">b</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
427
|
+
<p class="mis show_mis"><span class="n"><a id="t343" href="#t343">343</a></span><span class="t"> <span class="nam">total</span> <span class="op">=</span> <span class="nam">len</span><span class="op">(</span><span class="nam">a</span><span class="op">)</span> <span class="op">+</span> <span class="nam">len</span><span class="op">(</span><span class="nam">b</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
428
|
+
<p class="mis show_mis"><span class="n"><a id="t344" href="#t344">344</a></span><span class="t"> <span class="nam">similarity</span> <span class="op">=</span> <span class="op">(</span><span class="num">2</span> <span class="op">*</span> <span class="nam">common</span><span class="op">)</span> <span class="op">/</span> <span class="nam">total</span> <span class="key">if</span> <span class="nam">total</span> <span class="op">></span> <span class="num">0</span> <span class="key">else</span> <span class="num">0</span> </span><span class="r"></span></p>
|
|
429
|
+
<p class="mis show_mis"><span class="n"><a id="t345" href="#t345">345</a></span><span class="t"> <span class="key">return</span> <span class="num">1.0</span> <span class="op">-</span> <span class="nam">similarity</span> </span><span class="r"></span></p>
|
|
430
|
+
<p class="pln"><span class="n"><a id="t346" href="#t346">346</a></span><span class="t"> </span><span class="r"></span></p>
|
|
431
|
+
<p class="mis show_mis"><span class="n"><a id="t347" href="#t347">347</a></span><span class="t"> <span class="key">def</span> <span class="nam">_check_correctness</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">response</span><span class="op">:</span> <span class="nam">Any</span><span class="op">,</span> <span class="nam">ground_truth</span><span class="op">:</span> <span class="nam">Any</span><span class="op">)</span> <span class="op">-></span> <span class="nam">bool</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
432
|
+
<p class="pln"><span class="n"><a id="t348" href="#t348">348</a></span><span class="t"> <span class="str">"""Check if response matches ground truth."""</span> </span><span class="r"></span></p>
|
|
433
|
+
<p class="pln"><span class="n"><a id="t349" href="#t349">349</a></span><span class="t"> <span class="com"># Exact match</span> </span><span class="r"></span></p>
|
|
434
|
+
<p class="mis show_mis"><span class="n"><a id="t350" href="#t350">350</a></span><span class="t"> <span class="key">if</span> <span class="nam">response</span> <span class="op">==</span> <span class="nam">ground_truth</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
435
|
+
<p class="mis show_mis"><span class="n"><a id="t351" href="#t351">351</a></span><span class="t"> <span class="key">return</span> <span class="key">True</span> </span><span class="r"></span></p>
|
|
436
|
+
<p class="pln"><span class="n"><a id="t352" href="#t352">352</a></span><span class="t"> </span><span class="r"></span></p>
|
|
437
|
+
<p class="pln"><span class="n"><a id="t353" href="#t353">353</a></span><span class="t"> <span class="com"># Numeric tolerance</span> </span><span class="r"></span></p>
|
|
438
|
+
<p class="mis show_mis"><span class="n"><a id="t354" href="#t354">354</a></span><span class="t"> <span class="key">if</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response</span><span class="op">,</span> <span class="op">(</span><span class="nam">int</span><span class="op">,</span> <span class="nam">float</span><span class="op">)</span><span class="op">)</span> <span class="key">and</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">ground_truth</span><span class="op">,</span> <span class="op">(</span><span class="nam">int</span><span class="op">,</span> <span class="nam">float</span><span class="op">)</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
439
|
+
<p class="mis show_mis"><span class="n"><a id="t355" href="#t355">355</a></span><span class="t"> <span class="key">return</span> <span class="nam">abs</span><span class="op">(</span><span class="nam">response</span> <span class="op">-</span> <span class="nam">ground_truth</span><span class="op">)</span> <span class="op"><</span> <span class="num">0.01</span> <span class="op">*</span> <span class="nam">abs</span><span class="op">(</span><span class="nam">ground_truth</span> <span class="op">+</span> <span class="num">1e-10</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
440
|
+
<p class="pln"><span class="n"><a id="t356" href="#t356">356</a></span><span class="t"> </span><span class="r"></span></p>
|
|
441
|
+
<p class="pln"><span class="n"><a id="t357" href="#t357">357</a></span><span class="t"> <span class="com"># String containment (ground truth in response)</span> </span><span class="r"></span></p>
|
|
442
|
+
<p class="mis show_mis"><span class="n"><a id="t358" href="#t358">358</a></span><span class="t"> <span class="key">if</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">response</span><span class="op">,</span> <span class="nam">str</span><span class="op">)</span> <span class="key">and</span> <span class="nam">isinstance</span><span class="op">(</span><span class="nam">ground_truth</span><span class="op">,</span> <span class="nam">str</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
443
|
+
<p class="mis show_mis"><span class="n"><a id="t359" href="#t359">359</a></span><span class="t"> <span class="key">return</span> <span class="nam">ground_truth</span><span class="op">.</span><span class="nam">lower</span><span class="op">(</span><span class="op">)</span> <span class="key">in</span> <span class="nam">response</span><span class="op">.</span><span class="nam">lower</span><span class="op">(</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
444
|
+
<p class="pln"><span class="n"><a id="t360" href="#t360">360</a></span><span class="t"> </span><span class="r"></span></p>
|
|
445
|
+
<p class="mis show_mis"><span class="n"><a id="t361" href="#t361">361</a></span><span class="t"> <span class="key">return</span> <span class="key">False</span> </span><span class="r"></span></p>
|
|
446
|
+
<p class="pln"><span class="n"><a id="t362" href="#t362">362</a></span><span class="t"> </span><span class="r"></span></p>
|
|
447
|
+
<p class="mis show_mis"><span class="n"><a id="t363" href="#t363">363</a></span><span class="t"> <span class="key">def</span> <span class="nam">aggregate_results</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">results</span><span class="op">:</span> <span class="nam">list</span><span class="op">[</span><span class="nam">VerificationResult</span><span class="op">]</span><span class="op">,</span> <span class="nam">config</span><span class="op">:</span> <span class="nam">dict</span> <span class="op">=</span> <span class="key">None</span><span class="op">)</span> <span class="op">-></span> <span class="nam">BenchmarkResults</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
448
|
+
<p class="pln"><span class="n"><a id="t364" href="#t364">364</a></span><span class="t"> <span class="str">"""Aggregate results into summary statistics."""</span> </span><span class="r"></span></p>
|
|
449
|
+
<p class="mis show_mis"><span class="n"><a id="t365" href="#t365">365</a></span><span class="t"> <span class="nam">correct</span> <span class="op">=</span> <span class="nam">sum</span><span class="op">(</span><span class="num">1</span> <span class="key">for</span> <span class="nam">r</span> <span class="key">in</span> <span class="nam">results</span> <span class="key">if</span> <span class="nam">r</span><span class="op">.</span><span class="nam">is_correct</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
450
|
+
<p class="mis show_mis"><span class="n"><a id="t366" href="#t366">366</a></span><span class="t"> <span class="nam">total</span> <span class="op">=</span> <span class="nam">len</span><span class="op">(</span><span class="nam">results</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
451
|
+
<p class="pln"><span class="n"><a id="t367" href="#t367">367</a></span><span class="t"> </span><span class="r"></span></p>
|
|
452
|
+
<p class="pln"><span class="n"><a id="t368" href="#t368">368</a></span><span class="t"> <span class="com"># Group by category</span> </span><span class="r"></span></p>
|
|
453
|
+
<p class="mis show_mis"><span class="n"><a id="t369" href="#t369">369</a></span><span class="t"> <span class="nam">by_category</span> <span class="op">=</span> <span class="op">{</span><span class="op">}</span> </span><span class="r"></span></p>
|
|
454
|
+
<p class="mis show_mis"><span class="n"><a id="t370" href="#t370">370</a></span><span class="t"> <span class="key">for</span> <span class="nam">r</span> <span class="key">in</span> <span class="nam">results</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
455
|
+
<p class="pln"><span class="n"><a id="t371" href="#t371">371</a></span><span class="t"> <span class="com"># Would need task info to group properly</span> </span><span class="r"></span></p>
|
|
456
|
+
<p class="mis show_mis"><span class="n"><a id="t372" href="#t372">372</a></span><span class="t"> <span class="key">pass</span> </span><span class="r"></span></p>
|
|
457
|
+
<p class="pln"><span class="n"><a id="t373" href="#t373">373</a></span><span class="t"> </span><span class="r"></span></p>
|
|
458
|
+
<p class="pln"><span class="n"><a id="t374" href="#t374">374</a></span><span class="t"> <span class="com"># Calculate latency</span> </span><span class="r"></span></p>
|
|
459
|
+
<p class="mis show_mis"><span class="n"><a id="t375" href="#t375">375</a></span><span class="t"> <span class="nam">all_latencies</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
460
|
+
<p class="mis show_mis"><span class="n"><a id="t376" href="#t376">376</a></span><span class="t"> <span class="key">for</span> <span class="nam">r</span> <span class="key">in</span> <span class="nam">results</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
461
|
+
<p class="mis show_mis"><span class="n"><a id="t377" href="#t377">377</a></span><span class="t"> <span class="key">for</span> <span class="nam">resp</span> <span class="key">in</span> <span class="nam">r</span><span class="op">.</span><span class="nam">responses</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
462
|
+
<p class="mis show_mis"><span class="n"><a id="t378" href="#t378">378</a></span><span class="t"> <span class="nam">all_latencies</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">resp</span><span class="op">.</span><span class="nam">latency_ms</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
463
|
+
<p class="pln"><span class="n"><a id="t379" href="#t379">379</a></span><span class="t"> </span><span class="r"></span></p>
|
|
464
|
+
<p class="mis show_mis"><span class="n"><a id="t380" href="#t380">380</a></span><span class="t"> <span class="nam">avg_latency</span> <span class="op">=</span> <span class="nam">np</span><span class="op">.</span><span class="nam">mean</span><span class="op">(</span><span class="nam">all_latencies</span><span class="op">)</span> <span class="key">if</span> <span class="nam">all_latencies</span> <span class="key">else</span> <span class="num">0.0</span> </span><span class="r"></span></p>
|
|
465
|
+
<p class="pln"><span class="n"><a id="t381" href="#t381">381</a></span><span class="t"> </span><span class="r"></span></p>
|
|
466
|
+
<p class="mis show_mis"><span class="n"><a id="t382" href="#t382">382</a></span><span class="t"> <span class="key">return</span> <span class="nam">BenchmarkResults</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
467
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t383" href="#t383">383</a></span><span class="t"> <span class="nam">total_tasks</span><span class="op">=</span><span class="nam">total</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
468
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t384" href="#t384">384</a></span><span class="t"> <span class="nam">correct</span><span class="op">=</span><span class="nam">correct</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
469
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t385" href="#t385">385</a></span><span class="t"> <span class="nam">accuracy</span><span class="op">=</span><span class="nam">correct</span> <span class="op">/</span> <span class="nam">total</span> <span class="key">if</span> <span class="nam">total</span> <span class="op">></span> <span class="num">0</span> <span class="key">else</span> <span class="num">0.0</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
470
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t386" href="#t386">386</a></span><span class="t"> <span class="nam">avg_latency_ms</span><span class="op">=</span><span class="nam">avg_latency</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
471
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t387" href="#t387">387</a></span><span class="t"> <span class="nam">by_category</span><span class="op">=</span><span class="op">{</span><span class="op">}</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
472
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t388" href="#t388">388</a></span><span class="t"> <span class="nam">by_difficulty</span><span class="op">=</span><span class="op">{</span><span class="op">}</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
473
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t389" href="#t389">389</a></span><span class="t"> <span class="nam">timestamp</span><span class="op">=</span><span class="nam">datetime</span><span class="op">.</span><span class="nam">utcnow</span><span class="op">(</span><span class="op">)</span><span class="op">.</span><span class="nam">isoformat</span><span class="op">(</span><span class="op">)</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
474
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t390" href="#t390">390</a></span><span class="t"> <span class="nam">config</span><span class="op">=</span><span class="nam">config</span> <span class="key">or</span> <span class="op">{</span><span class="op">}</span> </span><span class="r"></span></p>
|
|
475
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t391" href="#t391">391</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
476
|
+
<p class="pln"><span class="n"><a id="t392" href="#t392">392</a></span><span class="t"> </span><span class="r"></span></p>
|
|
477
|
+
<p class="mis show_mis"><span class="n"><a id="t393" href="#t393">393</a></span><span class="t"> <span class="key">def</span> <span class="nam">save_results</span><span class="op">(</span><span class="nam">self</span><span class="op">,</span> <span class="nam">results</span><span class="op">:</span> <span class="nam">BenchmarkResults</span><span class="op">,</span> <span class="nam">path</span><span class="op">:</span> <span class="nam">Path</span><span class="op">)</span> <span class="op">-></span> <span class="key">None</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
478
|
+
<p class="pln"><span class="n"><a id="t394" href="#t394">394</a></span><span class="t"> <span class="str">"""Save results to JSON file."""</span> </span><span class="r"></span></p>
|
|
479
|
+
<p class="mis show_mis"><span class="n"><a id="t395" href="#t395">395</a></span><span class="t"> <span class="key">with</span> <span class="nam">open</span><span class="op">(</span><span class="nam">path</span><span class="op">,</span> <span class="str">"w"</span><span class="op">)</span> <span class="key">as</span> <span class="nam">f</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
480
|
+
<p class="mis show_mis"><span class="n"><a id="t396" href="#t396">396</a></span><span class="t"> <span class="nam">json</span><span class="op">.</span><span class="nam">dump</span><span class="op">(</span><span class="op">{</span> </span><span class="r"></span></p>
|
|
481
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t397" href="#t397">397</a></span><span class="t"> <span class="str">"total_tasks"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">total_tasks</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
482
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t398" href="#t398">398</a></span><span class="t"> <span class="str">"correct"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">correct</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
483
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t399" href="#t399">399</a></span><span class="t"> <span class="str">"accuracy"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">accuracy</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
484
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t400" href="#t400">400</a></span><span class="t"> <span class="str">"avg_latency_ms"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">avg_latency_ms</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
485
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t401" href="#t401">401</a></span><span class="t"> <span class="str">"by_category"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">by_category</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
486
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t402" href="#t402">402</a></span><span class="t"> <span class="str">"by_difficulty"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">by_difficulty</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
487
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t403" href="#t403">403</a></span><span class="t"> <span class="str">"timestamp"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">timestamp</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
488
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t404" href="#t404">404</a></span><span class="t"> <span class="str">"config"</span><span class="op">:</span> <span class="nam">results</span><span class="op">.</span><span class="nam">config</span> </span><span class="r"></span></p>
|
|
489
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t405" href="#t405">405</a></span><span class="t"> <span class="op">}</span><span class="op">,</span> <span class="nam">f</span><span class="op">,</span> <span class="nam">indent</span><span class="op">=</span><span class="num">2</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
490
|
+
<p class="pln"><span class="n"><a id="t406" href="#t406">406</a></span><span class="t"> </span><span class="r"></span></p>
|
|
491
|
+
<p class="pln"><span class="n"><a id="t407" href="#t407">407</a></span><span class="t"> </span><span class="r"></span></p>
|
|
492
|
+
<p class="mis show_mis"><span class="n"><a id="t408" href="#t408">408</a></span><span class="t"><span class="key">def</span> <span class="nam">create_sample_tasks</span><span class="op">(</span><span class="nam">n</span><span class="op">:</span> <span class="nam">int</span> <span class="op">=</span> <span class="num">100</span><span class="op">)</span> <span class="op">-></span> <span class="nam">list</span><span class="op">[</span><span class="nam">BenchmarkTask</span><span class="op">]</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
493
|
+
<p class="pln"><span class="n"><a id="t409" href="#t409">409</a></span><span class="t"> <span class="str">"""Create sample benchmark tasks for testing."""</span> </span><span class="r"></span></p>
|
|
494
|
+
<p class="mis show_mis"><span class="n"><a id="t410" href="#t410">410</a></span><span class="t"> <span class="nam">tasks</span> <span class="op">=</span> <span class="op">[</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
495
|
+
<p class="pln"><span class="n"><a id="t411" href="#t411">411</a></span><span class="t"> </span><span class="r"></span></p>
|
|
496
|
+
<p class="mis show_mis"><span class="n"><a id="t412" href="#t412">412</a></span><span class="t"> <span class="nam">categories</span> <span class="op">=</span> <span class="nam">list</span><span class="op">(</span><span class="nam">TaskCategory</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
497
|
+
<p class="mis show_mis"><span class="n"><a id="t413" href="#t413">413</a></span><span class="t"> <span class="nam">difficulties</span> <span class="op">=</span> <span class="op">[</span><span class="str">"easy"</span><span class="op">,</span> <span class="str">"medium"</span><span class="op">,</span> <span class="str">"hard"</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
498
|
+
<p class="pln"><span class="n"><a id="t414" href="#t414">414</a></span><span class="t"> </span><span class="r"></span></p>
|
|
499
|
+
<p class="mis show_mis"><span class="n"><a id="t415" href="#t415">415</a></span><span class="t"> <span class="key">for</span> <span class="nam">i</span> <span class="key">in</span> <span class="nam">range</span><span class="op">(</span><span class="nam">n</span><span class="op">)</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
500
|
+
<p class="mis show_mis"><span class="n"><a id="t416" href="#t416">416</a></span><span class="t"> <span class="nam">cat</span> <span class="op">=</span> <span class="nam">categories</span><span class="op">[</span><span class="nam">i</span> <span class="op">%</span> <span class="nam">len</span><span class="op">(</span><span class="nam">categories</span><span class="op">)</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
501
|
+
<p class="mis show_mis"><span class="n"><a id="t417" href="#t417">417</a></span><span class="t"> <span class="nam">diff</span> <span class="op">=</span> <span class="nam">difficulties</span><span class="op">[</span><span class="nam">i</span> <span class="op">%</span> <span class="nam">len</span><span class="op">(</span><span class="nam">difficulties</span><span class="op">)</span><span class="op">]</span> </span><span class="r"></span></p>
|
|
502
|
+
<p class="pln"><span class="n"><a id="t418" href="#t418">418</a></span><span class="t"> </span><span class="r"></span></p>
|
|
503
|
+
<p class="mis show_mis"><span class="n"><a id="t419" href="#t419">419</a></span><span class="t"> <span class="key">if</span> <span class="nam">cat</span> <span class="op">==</span> <span class="nam">TaskCategory</span><span class="op">.</span><span class="nam">FACTUAL</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
504
|
+
<p class="mis show_mis"><span class="n"><a id="t420" href="#t420">420</a></span><span class="t"> <span class="nam">tasks</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">BenchmarkTask</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
505
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t421" href="#t421">421</a></span><span class="t"> <span class="nam">id</span><span class="op">=</span><span class="fst">f"</span><span class="fst">factual_</span><span class="op">{</span><span class="nam">i</span><span class="op">}</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
506
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t422" href="#t422">422</a></span><span class="t"> <span class="nam">category</span><span class="op">=</span><span class="nam">cat</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
507
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t423" href="#t423">423</a></span><span class="t"> <span class="nam">prompt</span><span class="op">=</span><span class="fst">f"</span><span class="fst">What is the capital of country </span><span class="op">{</span><span class="nam">i</span> <span class="op">%</span> <span class="num">50</span><span class="op">}</span><span class="fst">?</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
508
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t424" href="#t424">424</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">=</span><span class="fst">f"</span><span class="fst">Capital_</span><span class="op">{</span><span class="nam">i</span> <span class="op">%</span> <span class="num">50</span><span class="op">}</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
509
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t425" href="#t425">425</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">=</span><span class="nam">diff</span> </span><span class="r"></span></p>
|
|
510
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t426" href="#t426">426</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
511
|
+
<p class="mis show_mis"><span class="n"><a id="t427" href="#t427">427</a></span><span class="t"> <span class="key">elif</span> <span class="nam">cat</span> <span class="op">==</span> <span class="nam">TaskCategory</span><span class="op">.</span><span class="nam">MATHEMATICAL</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
512
|
+
<p class="mis show_mis"><span class="n"><a id="t428" href="#t428">428</a></span><span class="t"> <span class="nam">a</span><span class="op">,</span> <span class="nam">b</span> <span class="op">=</span> <span class="nam">i</span> <span class="op">*</span> <span class="num">7</span><span class="op">,</span> <span class="nam">i</span> <span class="op">*</span> <span class="num">3</span> </span><span class="r"></span></p>
|
|
513
|
+
<p class="mis show_mis"><span class="n"><a id="t429" href="#t429">429</a></span><span class="t"> <span class="nam">tasks</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">BenchmarkTask</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
514
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t430" href="#t430">430</a></span><span class="t"> <span class="nam">id</span><span class="op">=</span><span class="fst">f"</span><span class="fst">math_</span><span class="op">{</span><span class="nam">i</span><span class="op">}</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
515
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t431" href="#t431">431</a></span><span class="t"> <span class="nam">category</span><span class="op">=</span><span class="nam">cat</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
516
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t432" href="#t432">432</a></span><span class="t"> <span class="nam">prompt</span><span class="op">=</span><span class="fst">f"</span><span class="fst">What is </span><span class="op">{</span><span class="nam">a</span><span class="op">}</span><span class="fst"> + </span><span class="op">{</span><span class="nam">b</span><span class="op">}</span><span class="fst">?</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
517
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t433" href="#t433">433</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">=</span><span class="nam">a</span> <span class="op">+</span> <span class="nam">b</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
518
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t434" href="#t434">434</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">=</span><span class="nam">diff</span> </span><span class="r"></span></p>
|
|
519
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t435" href="#t435">435</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
520
|
+
<p class="mis show_mis"><span class="n"><a id="t436" href="#t436">436</a></span><span class="t"> <span class="key">elif</span> <span class="nam">cat</span> <span class="op">==</span> <span class="nam">TaskCategory</span><span class="op">.</span><span class="nam">REASONING</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
521
|
+
<p class="mis show_mis"><span class="n"><a id="t437" href="#t437">437</a></span><span class="t"> <span class="nam">tasks</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">BenchmarkTask</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
522
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t438" href="#t438">438</a></span><span class="t"> <span class="nam">id</span><span class="op">=</span><span class="fst">f"</span><span class="fst">reasoning_</span><span class="op">{</span><span class="nam">i</span><span class="op">}</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
523
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t439" href="#t439">439</a></span><span class="t"> <span class="nam">category</span><span class="op">=</span><span class="nam">cat</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
524
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t440" href="#t440">440</a></span><span class="t"> <span class="nam">prompt</span><span class="op">=</span><span class="fst">f"</span><span class="fst">If A implies B, and A is true, what can we conclude about B?</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
525
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t441" href="#t441">441</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">=</span><span class="str">"B is true"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
526
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t442" href="#t442">442</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">=</span><span class="nam">diff</span> </span><span class="r"></span></p>
|
|
527
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t443" href="#t443">443</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
528
|
+
<p class="pln"><span class="n"><a id="t444" href="#t444">444</a></span><span class="t"> <span class="key">else</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
529
|
+
<p class="mis show_mis"><span class="n"><a id="t445" href="#t445">445</a></span><span class="t"> <span class="nam">tasks</span><span class="op">.</span><span class="nam">append</span><span class="op">(</span><span class="nam">BenchmarkTask</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
530
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t446" href="#t446">446</a></span><span class="t"> <span class="nam">id</span><span class="op">=</span><span class="fst">f"</span><span class="fst">extraction_</span><span class="op">{</span><span class="nam">i</span><span class="op">}</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
531
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t447" href="#t447">447</a></span><span class="t"> <span class="nam">category</span><span class="op">=</span><span class="nam">cat</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
532
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t448" href="#t448">448</a></span><span class="t"> <span class="nam">prompt</span><span class="op">=</span><span class="fst">f"</span><span class="fst">Extract the number from: 'There are </span><span class="op">{</span><span class="nam">i</span> <span class="op">*</span> <span class="num">5</span><span class="op">}</span><span class="fst"> items'</span><span class="fst">"</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
533
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t449" href="#t449">449</a></span><span class="t"> <span class="nam">ground_truth</span><span class="op">=</span><span class="nam">i</span> <span class="op">*</span> <span class="num">5</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
534
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t450" href="#t450">450</a></span><span class="t"> <span class="nam">difficulty</span><span class="op">=</span><span class="nam">diff</span> </span><span class="r"></span></p>
|
|
535
|
+
<p class="mis mis2 show_mis"><span class="n"><a id="t451" href="#t451">451</a></span><span class="t"> <span class="op">)</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
536
|
+
<p class="pln"><span class="n"><a id="t452" href="#t452">452</a></span><span class="t"> </span><span class="r"></span></p>
|
|
537
|
+
<p class="mis show_mis"><span class="n"><a id="t453" href="#t453">453</a></span><span class="t"> <span class="key">return</span> <span class="nam">tasks</span> </span><span class="r"></span></p>
|
|
538
|
+
<p class="pln"><span class="n"><a id="t454" href="#t454">454</a></span><span class="t"> </span><span class="r"></span></p>
|
|
539
|
+
<p class="pln"><span class="n"><a id="t455" href="#t455">455</a></span><span class="t"> </span><span class="r"></span></p>
|
|
540
|
+
<p class="exc show_exc"><span class="n"><a id="t456" href="#t456">456</a></span><span class="t"><span class="key">if</span> <span class="nam">__name__</span> <span class="op">==</span> <span class="str">"__main__"</span><span class="op">:</span> </span><span class="r"></span></p>
|
|
541
|
+
<p class="pln"><span class="n"><a id="t457" href="#t457">457</a></span><span class="t"> <span class="com"># Demo: Run benchmark with mock data</span> </span><span class="r"></span></p>
|
|
542
|
+
<p class="exc show_exc"><span class="n"><a id="t458" href="#t458">458</a></span><span class="t"> <span class="nam">benchmark</span> <span class="op">=</span> <span class="nam">CMVKBenchmark</span><span class="op">(</span> </span><span class="r"></span></p>
|
|
543
|
+
<p class="exc exc2 show_exc"><span class="n"><a id="t459" href="#t459">459</a></span><span class="t"> <span class="nam">models</span><span class="op">=</span><span class="op">[</span><span class="str">"model_a"</span><span class="op">,</span> <span class="str">"model_b"</span><span class="op">,</span> <span class="str">"model_c"</span><span class="op">]</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
544
|
+
<p class="exc exc2 show_exc"><span class="n"><a id="t460" href="#t460">460</a></span><span class="t"> <span class="nam">consensus_method</span><span class="op">=</span><span class="nam">ConsensusMethod</span><span class="op">.</span><span class="nam">DRIFT_THRESHOLD</span><span class="op">,</span> </span><span class="r"></span></p>
|
|
545
|
+
<p class="exc exc2 show_exc"><span class="n"><a id="t461" href="#t461">461</a></span><span class="t"> <span class="nam">drift_threshold</span><span class="op">=</span><span class="num">0.15</span> </span><span class="r"></span></p>
|
|
546
|
+
<p class="exc exc2 show_exc"><span class="n"><a id="t462" href="#t462">462</a></span><span class="t"> <span class="op">)</span> </span><span class="r"></span></p>
|
|
547
|
+
<p class="pln"><span class="n"><a id="t463" href="#t463">463</a></span><span class="t"> </span><span class="r"></span></p>
|
|
548
|
+
<p class="exc show_exc"><span class="n"><a id="t464" href="#t464">464</a></span><span class="t"> <span class="nam">tasks</span> <span class="op">=</span> <span class="nam">create_sample_tasks</span><span class="op">(</span><span class="num">100</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
549
|
+
<p class="pln"><span class="n"><a id="t465" href="#t465">465</a></span><span class="t"> </span><span class="r"></span></p>
|
|
550
|
+
<p class="exc show_exc"><span class="n"><a id="t466" href="#t466">466</a></span><span class="t"> <span class="nam">print</span><span class="op">(</span><span class="str">"Running single-model benchmark..."</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
551
|
+
<p class="exc show_exc"><span class="n"><a id="t467" href="#t467">467</a></span><span class="t"> <span class="nam">single_results</span> <span class="op">=</span> <span class="nam">benchmark</span><span class="op">.</span><span class="nam">run_single_model</span><span class="op">(</span><span class="nam">tasks</span><span class="op">,</span> <span class="str">"model_a"</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
552
|
+
<p class="exc show_exc"><span class="n"><a id="t468" href="#t468">468</a></span><span class="t"> <span class="nam">single_summary</span> <span class="op">=</span> <span class="nam">benchmark</span><span class="op">.</span><span class="nam">aggregate_results</span><span class="op">(</span><span class="nam">single_results</span><span class="op">,</span> <span class="op">{</span><span class="str">"mode"</span><span class="op">:</span> <span class="str">"single"</span><span class="op">,</span> <span class="str">"model"</span><span class="op">:</span> <span class="str">"model_a"</span><span class="op">}</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
553
|
+
<p class="exc show_exc"><span class="n"><a id="t469" href="#t469">469</a></span><span class="t"> <span class="nam">print</span><span class="op">(</span><span class="fst">f"</span><span class="fst">Single model accuracy: </span><span class="op">{</span><span class="nam">single_summary</span><span class="op">.</span><span class="nam">accuracy</span><span class="op">:</span><span class="fst">.2%</span><span class="op">}</span><span class="fst">"</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
554
|
+
<p class="pln"><span class="n"><a id="t470" href="#t470">470</a></span><span class="t"> </span><span class="r"></span></p>
|
|
555
|
+
<p class="exc show_exc"><span class="n"><a id="t471" href="#t471">471</a></span><span class="t"> <span class="nam">print</span><span class="op">(</span><span class="str">"\nRunning multi-model (CMVK) benchmark..."</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
556
|
+
<p class="exc show_exc"><span class="n"><a id="t472" href="#t472">472</a></span><span class="t"> <span class="nam">multi_results</span> <span class="op">=</span> <span class="nam">benchmark</span><span class="op">.</span><span class="nam">run_multi_model</span><span class="op">(</span><span class="nam">tasks</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
557
|
+
<p class="exc show_exc"><span class="n"><a id="t473" href="#t473">473</a></span><span class="t"> <span class="nam">multi_summary</span> <span class="op">=</span> <span class="nam">benchmark</span><span class="op">.</span><span class="nam">aggregate_results</span><span class="op">(</span><span class="nam">multi_results</span><span class="op">,</span> <span class="op">{</span><span class="str">"mode"</span><span class="op">:</span> <span class="str">"cmvk"</span><span class="op">,</span> <span class="str">"models"</span><span class="op">:</span> <span class="nam">benchmark</span><span class="op">.</span><span class="nam">models</span><span class="op">}</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
558
|
+
<p class="exc show_exc"><span class="n"><a id="t474" href="#t474">474</a></span><span class="t"> <span class="nam">print</span><span class="op">(</span><span class="fst">f"</span><span class="fst">CMVK accuracy: </span><span class="op">{</span><span class="nam">multi_summary</span><span class="op">.</span><span class="nam">accuracy</span><span class="op">:</span><span class="fst">.2%</span><span class="op">}</span><span class="fst">"</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
559
|
+
<p class="pln"><span class="n"><a id="t475" href="#t475">475</a></span><span class="t"> </span><span class="r"></span></p>
|
|
560
|
+
<p class="exc show_exc"><span class="n"><a id="t476" href="#t476">476</a></span><span class="t"> <span class="nam">print</span><span class="op">(</span><span class="str">"\nNote: These are mock results. Run with real LLM APIs for actual benchmarks."</span><span class="op">)</span> </span><span class="r"></span></p>
|
|
561
|
+
</main>
|
|
562
|
+
<footer>
|
|
563
|
+
<div class="content">
|
|
564
|
+
<p>
|
|
565
|
+
<a class="nav" href="z_2c49bd2ed3e01e38_audit_py.html">« prev</a>
|
|
566
|
+
<a class="nav" href="index.html">^ index</a>
|
|
567
|
+
<a class="nav" href="z_2c49bd2ed3e01e38_constitutional_py.html">» next</a>
|
|
568
|
+
|
|
569
|
+
<a class="nav" href="https://coverage.readthedocs.io/en/7.13.1">coverage.py v7.13.1</a>,
|
|
570
|
+
created at 2026-02-02 21:04 -0800
|
|
571
|
+
</p>
|
|
572
|
+
</div>
|
|
573
|
+
</footer>
|
|
574
|
+
</body>
|
|
575
|
+
</html>
|