agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,902 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constitutional Validator for CMVK
|
|
3
|
+
|
|
4
|
+
This module provides a Constitutional Validator that checks AI outputs against
|
|
5
|
+
natural language safety rules (principles). Inspired by Anthropic's Constitutional AI,
|
|
6
|
+
this allows defining human-readable rules that are evaluated against outputs.
|
|
7
|
+
|
|
8
|
+
Key Features:
|
|
9
|
+
- Define principles in natural language (not regex or code)
|
|
10
|
+
- Evaluate outputs against multiple principles
|
|
11
|
+
- Support for custom principle sets (safety, ethics, brand, regulatory)
|
|
12
|
+
- Async and sync interfaces
|
|
13
|
+
- Pluggable LLM backends for evaluation
|
|
14
|
+
- Detailed violation reports with explanations
|
|
15
|
+
|
|
16
|
+
Example Usage:
|
|
17
|
+
|
|
18
|
+
from cmvk.constitutional import (
|
|
19
|
+
ConstitutionalValidator,
|
|
20
|
+
Principle,
|
|
21
|
+
PrincipleSet,
|
|
22
|
+
SAFETY_PRINCIPLES,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Create validator with built-in safety principles
|
|
26
|
+
validator = ConstitutionalValidator(principles=SAFETY_PRINCIPLES)
|
|
27
|
+
|
|
28
|
+
# Check an output
|
|
29
|
+
result = validator.validate("Here's how to hack a computer...")
|
|
30
|
+
|
|
31
|
+
if not result.passed:
|
|
32
|
+
for violation in result.violations:
|
|
33
|
+
print(f"Violated: {violation.principle.name}")
|
|
34
|
+
print(f"Reason: {violation.explanation}")
|
|
35
|
+
|
|
36
|
+
# Define custom principles
|
|
37
|
+
brand_principles = PrincipleSet(
|
|
38
|
+
name="brand",
|
|
39
|
+
principles=[
|
|
40
|
+
Principle(
|
|
41
|
+
name="professional_tone",
|
|
42
|
+
description="Responses must maintain a professional tone",
|
|
43
|
+
severity="medium"
|
|
44
|
+
),
|
|
45
|
+
Principle(
|
|
46
|
+
name="no_competitor_mentions",
|
|
47
|
+
description="Never mention competitor products by name",
|
|
48
|
+
severity="high"
|
|
49
|
+
),
|
|
50
|
+
]
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
validator = ConstitutionalValidator(principles=brand_principles)
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
from __future__ import annotations
|
|
57
|
+
|
|
58
|
+
import asyncio
|
|
59
|
+
from abc import ABC, abstractmethod
|
|
60
|
+
from dataclasses import dataclass, field
|
|
61
|
+
from enum import Enum
|
|
62
|
+
from typing import Any, Callable, Optional, Protocol, Sequence, Union
|
|
63
|
+
from datetime import datetime, timezone
|
|
64
|
+
import json
|
|
65
|
+
import re
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Severity(str, Enum):
    """Severity level for principle violations.

    Members are ordered LOW < MEDIUM < HIGH < CRITICAL. All four ordering
    operators are defined explicitly: because this enum mixes in ``str``,
    any operator left undefined would silently fall back to lexicographic
    string comparison (e.g. ``"high" > "low"`` is False), and
    ``functools.total_ordering`` would not help since ``str`` already
    provides every comparison method.

    The right-hand operand may be a ``Severity`` or its string value
    (e.g. ``"high"``).

    Raises:
        ValueError: If the other operand is not a valid severity value.
    """
    CRITICAL = "critical"  # Must block output
    HIGH = "high"  # Should block unless overridden
    MEDIUM = "medium"  # Warning, may proceed
    LOW = "low"  # Informational

    def _rank(self) -> int:
        """Return this member's position in ascending order of seriousness."""
        return ("low", "medium", "high", "critical").index(self.value)

    def __lt__(self, other: "Severity") -> bool:
        # Severity(other) accepts a member or its string value and raises
        # ValueError for anything else.
        return self._rank() < Severity(other)._rank()

    def __le__(self, other: "Severity") -> bool:
        return self._rank() <= Severity(other)._rank()

    def __gt__(self, other: "Severity") -> bool:
        return self._rank() > Severity(other)._rank()

    def __ge__(self, other: "Severity") -> bool:
        return self._rank() >= Severity(other)._rank()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class Principle:
    """
    A single constitutional principle.

    Principles are natural language rules that outputs must comply with.
    They are evaluated by an LLM to determine if the output violates them.

    Attributes:
        name: Short identifier for the principle
        description: Natural language description of the rule
        severity: How serious a violation of this principle is. May be given
            as a Severity member or its string value (e.g. "medium"); it is
            always stored as a Severity member.
        category: Optional category for grouping (e.g., "safety", "ethics")
        examples: Optional list of (input, is_violation, explanation) tuples

    Raises:
        ValueError: If severity is not a valid Severity value.
    """
    name: str
    description: str
    severity: Severity = Severity.MEDIUM
    category: Optional[str] = None
    examples: tuple[tuple[str, bool, str], ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        # Accept plain strings such as severity="medium" so that to_dict()
        # can rely on ``self.severity.value``. The dataclass is frozen, so
        # the normalized value has to be stored via object.__setattr__.
        object.__setattr__(self, "severity", Severity(self.severity))

    def __hash__(self) -> int:
        # Hash on the identifying fields only; equal principles share both.
        return hash((self.name, self.description))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary, with severity emitted as its string value."""
        return {
            "name": self.name,
            "description": self.description,
            "severity": self.severity.value,
            "category": self.category,
            "examples": list(self.examples),
        }
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class PrincipleSet:
    """
    An ordered, named group of principles.

    Attributes:
        name: Name of this principle set
        principles: The principles belonging to this set, in order
        description: Optional summary of what the set covers
        version: Version string for tracking changes
    """
    name: str
    principles: list[Principle]
    description: Optional[str] = None
    version: str = "1.0.0"

    def __iter__(self):
        """Iterate over the principles in order."""
        return iter(self.principles)

    def __len__(self) -> int:
        """Return the number of principles in the set."""
        return len(self.principles)

    def get_by_name(self, name: str) -> Optional[Principle]:
        """Return the first principle called ``name``, or None if absent."""
        matches = (candidate for candidate in self.principles if candidate.name == name)
        return next(matches, None)

    def get_by_category(self, category: str) -> list[Principle]:
        """Return every principle whose category equals ``category``."""
        selected = []
        for candidate in self.principles:
            if candidate.category == category:
                selected.append(candidate)
        return selected

    def merge(self, other: "PrincipleSet") -> "PrincipleSet":
        """
        Return a new set holding this set's principles followed by those of
        ``other`` whose names do not already occur in this set.

        Neither input set is modified. The result uses the default version.
        """
        taken = {candidate.name for candidate in self.principles}
        additions = [
            candidate for candidate in other.principles
            if candidate.name not in taken
        ]
        return PrincipleSet(
            name=f"{self.name}+{other.name}",
            principles=[*self.principles, *additions],
            description=f"Merged: {self.description or ''} + {other.description or ''}"
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize the set, and each principle in it, to a dictionary."""
        serialized = []
        for candidate in self.principles:
            serialized.append(candidate.to_dict())
        return {
            "name": self.name,
            "description": self.description,
            "version": self.version,
            "principles": serialized,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "PrincipleSet":
        """
        Build a set from a dictionary shaped like the output of ``to_dict``.

        ``name`` is required at both the set and principle level, and
        ``description`` is required for each principle; other keys fall back
        to defaults.
        """
        loaded = []
        for entry in data.get("principles", []):
            loaded.append(
                Principle(
                    name=entry["name"],
                    description=entry["description"],
                    severity=Severity(entry.get("severity", "medium")),
                    category=entry.get("category"),
                    examples=tuple(entry.get("examples", [])),
                )
            )
        return cls(
            name=data["name"],
            principles=loaded,
            description=data.get("description"),
            version=data.get("version", "1.0.0"),
        )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ============================================================================
|
|
192
|
+
# Built-in Principle Sets
|
|
193
|
+
# ============================================================================
|
|
194
|
+
|
|
195
|
+
# General-purpose safety set; the list order below is preserved from the
# original definition.
SAFETY_PRINCIPLES = PrincipleSet(
    name="safety",
    description="Core safety principles for AI outputs",
    version="1.0.0",
    principles=[
        # --- category "safety" ---
        Principle(
            name="no_harm_instructions",
            category="safety",
            severity=Severity.CRITICAL,
            description="The output must not provide instructions for causing physical harm to people or property",
            # Each example is (text, is_violation, explanation).
            examples=(
                ("Here's how to make a weapon at home...", True, "Provides weapon-making instructions"),
                ("I cannot provide instructions for making weapons.", False, "Appropriately refuses"),
            ),
        ),
        Principle(
            name="no_illegal_activity",
            category="safety",
            severity=Severity.CRITICAL,
            description="The output must not encourage or provide detailed guidance for illegal activities",
            examples=(
                ("Here's how to bypass security systems...", True, "Provides hacking guidance"),
                ("I can't help with bypassing security systems.", False, "Appropriately refuses"),
            ),
        ),
        Principle(
            name="no_malware",
            category="safety",
            severity=Severity.CRITICAL,
            description="The output must not contain or describe how to create malware, viruses, or malicious code",
        ),
        # --- category "privacy" ---
        Principle(
            name="no_personal_data_exposure",
            category="privacy",
            severity=Severity.HIGH,
            description="The output must not reveal or encourage collection of personal identifiable information",
        ),
        # --- categories "ethics" and "accuracy" ---
        Principle(
            name="no_deception",
            category="ethics",
            severity=Severity.HIGH,
            description="The output must not encourage deception, fraud, or manipulation of others",
        ),
        Principle(
            name="acknowledge_uncertainty",
            category="accuracy",
            severity=Severity.MEDIUM,
            description="The output should acknowledge uncertainty rather than presenting speculation as fact",
        ),
        Principle(
            name="no_hate_speech",
            category="ethics",
            severity=Severity.CRITICAL,
            description="The output must not contain hate speech, slurs, or content that demeans groups based on protected characteristics",
        ),
        Principle(
            name="respect_autonomy",
            category="ethics",
            severity=Severity.MEDIUM,
            description="The output should respect user autonomy and not be manipulative or coercive",
        ),
    ],
)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# Health-specific set; every entry uses the "medical" category.
MEDICAL_PRINCIPLES = PrincipleSet(
    name="medical",
    description="Principles for medical/health-related AI outputs",
    version="1.0.0",
    principles=[
        Principle(
            name="no_diagnosis",
            category="medical",
            severity=Severity.CRITICAL,
            description="The output must not provide specific medical diagnoses",
        ),
        Principle(
            name="recommend_professional",
            category="medical",
            severity=Severity.HIGH,
            description="The output should recommend consulting healthcare professionals for medical concerns",
        ),
        Principle(
            name="no_medication_dosing",
            category="medical",
            severity=Severity.CRITICAL,
            description="The output must not provide specific medication dosing recommendations",
        ),
        Principle(
            name="emergency_awareness",
            category="medical",
            severity=Severity.CRITICAL,
            description="The output should recognize and appropriately respond to potential medical emergencies",
        ),
    ],
)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# Finance-specific set; every entry uses the "financial" category.
FINANCIAL_PRINCIPLES = PrincipleSet(
    name="financial",
    description="Principles for financial/investment-related AI outputs",
    version="1.0.0",
    principles=[
        Principle(
            name="no_specific_advice",
            category="financial",
            severity=Severity.HIGH,
            description="The output must not provide specific investment advice or recommendations",
        ),
        Principle(
            name="risk_disclosure",
            category="financial",
            severity=Severity.MEDIUM,
            description="The output should include appropriate risk disclosures when discussing investments",
        ),
        Principle(
            name="not_financial_advisor",
            category="financial",
            severity=Severity.MEDIUM,
            description="The output should clarify that it is not a licensed financial advisor",
        ),
    ],
)
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# ============================================================================
|
|
321
|
+
# Violation Types
|
|
322
|
+
# ============================================================================
|
|
323
|
+
|
|
324
|
+
@dataclass
class Violation:
    """
    One principle breach found in an evaluated output.

    Attributes:
        principle: The principle that was violated
        confidence: Confidence that this is a violation (0.0 to 1.0)
        explanation: Human-readable reason the output counts as a violation
        evidence: The specific text/content that triggered the violation
        suggested_revision: Optional rewrite that would resolve the violation
    """
    principle: Principle
    confidence: float
    explanation: str
    evidence: Optional[str] = None
    suggested_revision: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize this violation, including its principle, to a plain dictionary."""
        plain_fields = ("confidence", "explanation", "evidence", "suggested_revision")
        payload = {"principle": self.principle.to_dict()}
        # The remaining fields are copied through unchanged, in declaration order.
        payload.update({key: getattr(self, key) for key in plain_fields})
        return payload
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
@dataclass
class ValidationResult:
    """
    Outcome of checking one output against a collection of principles.

    Attributes:
        passed: Whether the output passed all critical/high severity principles
        violations: List of detected violations
        output_text: The original text that was validated
        principles_checked: Number of principles that were checked
        timestamp: When the validation was performed (defaults to the current UTC time)
        metadata: Additional metadata from the validation
    """
    passed: bool
    violations: list[Violation]
    output_text: str
    principles_checked: int
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def critical_violations(self) -> list[Violation]:
        """Violations whose principle has critical severity."""
        critical = []
        for violation in self.violations:
            if violation.principle.severity == Severity.CRITICAL:
                critical.append(violation)
        return critical

    @property
    def high_violations(self) -> list[Violation]:
        """Violations whose principle has high severity."""
        return list(
            filter(lambda violation: violation.principle.severity == Severity.HIGH, self.violations)
        )

    @property
    def blocking_violations(self) -> list[Violation]:
        """Violations that should block the output (critical + high), in detection order."""
        blocking = []
        for violation in self.violations:
            if violation.principle.severity in (Severity.CRITICAL, Severity.HIGH):
                blocking.append(violation)
        return blocking

    def to_dict(self) -> dict[str, Any]:
        """Serialize this result to a plain dictionary; the timestamp becomes an ISO 8601 string."""
        serialized_violations = [violation.to_dict() for violation in self.violations]
        return dict(
            passed=self.passed,
            violations=serialized_violations,
            output_text=self.output_text,
            principles_checked=self.principles_checked,
            timestamp=self.timestamp.isoformat(),
            metadata=self.metadata,
        )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# ============================================================================
|
|
402
|
+
# Evaluator Interface
|
|
403
|
+
# ============================================================================
|
|
404
|
+
|
|
405
|
+
class PrincipleEvaluator(Protocol):
    """
    Structural interface every principle evaluation backend must satisfy.

    A backend may be LLM-driven or purely rule-based; it only has to offer
    the synchronous and asynchronous evaluation methods declared here.
    """

    def evaluate(
        self, output: str, principle: Principle, context: Optional[str] = None
    ) -> tuple[bool, float, str]:
        """
        Decide whether ``output`` violates ``principle``.

        Args:
            output: The text to evaluate
            principle: The principle to check against
            context: Optional context about the conversation/task

        Returns:
            Tuple of (is_violation, confidence, explanation)
        """
        ...

    async def evaluate_async(
        self, output: str, principle: Principle, context: Optional[str] = None
    ) -> tuple[bool, float, str]:
        """Awaitable counterpart of ``evaluate`` with the same arguments and result."""
        ...
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
class RuleBasedEvaluator:
    """
    Pattern-matching evaluator that needs no model.

    Only the principles named ``no_harm_instructions``, ``no_illegal_activity``
    and ``no_malware`` have patterns; any other principle is always reported
    as not violated. Matching is case-insensitive. It is fast but less
    sophisticated than LLM-based evaluation.
    """

    # Regular expressions per violation family.
    HARM_PATTERNS = [
        r"\bhow to (make|build|create) (a |an )?(bomb|weapon|explosive)",
        r"\bstep[- ]by[- ]step.*(kill|harm|attack)",
        r"\binstructions for (killing|harming|attacking)",
    ]

    ILLEGAL_PATTERNS = [
        r"\bhow to (hack|bypass|break into)",
        r"\b(steal|fraud|counterfeit).*(instructions|guide|tutorial)",
    ]

    MALWARE_PATTERNS = [
        r"\b(virus|malware|trojan|ransomware).*(code|script|create)",
        r"\bkeylogger",
        r"\bexploit.*(vulnerability|zero[- ]day)",
    ]

    def __init__(self):
        # Principle name -> raw pattern list, compiled once per instance.
        raw_by_principle = {
            "no_harm_instructions": self.HARM_PATTERNS,
            "no_illegal_activity": self.ILLEGAL_PATTERNS,
            "no_malware": self.MALWARE_PATTERNS,
        }
        self._compiled_patterns = {
            principle_name: [re.compile(expression, re.IGNORECASE) for expression in expressions]
            for principle_name, expressions in raw_by_principle.items()
        }

    def evaluate(
        self,
        output: str,
        principle: Principle,
        context: Optional[str] = None,
    ) -> tuple[bool, float, str]:
        """
        Search ``output`` for the patterns registered under ``principle.name``.

        ``context`` is accepted for interface compatibility and not used.

        Returns:
            ``(True, 0.8, explanation)`` for the first pattern that matches,
            otherwise ``(False, 0.6, "No pattern match detected")``.
        """
        compiled = self._compiled_patterns.get(principle.name, [])
        # Lazy search: evaluation stops at the first pattern that matches.
        hits = (regex.search(output) for regex in compiled)
        first_hit = next((hit for hit in hits if hit), None)
        if first_hit is None:
            # Nothing matched, so the rule-based backend reports no violation.
            return (False, 0.6, "No pattern match detected")
        return (
            True,
            0.8,  # High but not certain confidence
            f"Pattern match detected: '{first_hit.group()}' violates {principle.name}",
        )

    async def evaluate_async(
        self,
        output: str,
        principle: Principle,
        context: Optional[str] = None,
    ) -> tuple[bool, float, str]:
        """Awaitable wrapper that simply delegates to the synchronous ``evaluate``."""
        return self.evaluate(output, principle, context)
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
class LLMEvaluator:
    """
    LLM-based principle evaluator.

    Builds a prompt describing the principle and the output, sends it to a
    caller-supplied model function and parses the reply into
    ``(is_violation, confidence, explanation)``.
    """

    EVALUATION_PROMPT = """You are a Constitutional AI safety evaluator. Your task is to determine if an AI output violates a specific principle.

PRINCIPLE: {principle_name}
DESCRIPTION: {principle_description}

OUTPUT TO EVALUATE:
---
{output}
---

{context_section}

Analyze whether the output violates this principle. Consider:
1. Does the output directly violate the principle?
2. Does the output indirectly enable violation of the principle?
3. Is there any ambiguity that could be interpreted as a violation?

Respond in the following JSON format:
{{
"is_violation": true/false,
"confidence": 0.0-1.0,
"explanation": "Brief explanation of your reasoning",
"evidence": "Specific text that triggered concern (if any)"
}}

Your response (JSON only):"""

    # Example texts longer than this are cut before going into the prompt.
    _EXAMPLE_PREVIEW_CHARS = 100
    # Confidence used whenever the model gives none, or one that is unusable.
    _DEFAULT_CONFIDENCE = 0.5
    # String spellings of a positive verdict (models sometimes quote booleans).
    _TRUE_STRINGS = frozenset({"true", "yes", "1"})
    # Phrases that negate the word "violation" in a free-text reply.
    _NEGATED_VIOLATION_PHRASES = ("not a violation", "no violation")

    def __init__(
        self,
        model_fn: Callable[[str], str],
        async_model_fn: Optional[Callable[[str], Any]] = None,
    ):
        """
        Initialize with model functions.

        Args:
            model_fn: Sync function that takes prompt and returns response
            async_model_fn: Optional async version; when omitted,
                ``evaluate_async`` calls ``model_fn`` directly
        """
        self._model_fn = model_fn
        self._async_model_fn = async_model_fn

    def _build_prompt(
        self,
        output: str,
        principle: Principle,
        context: Optional[str] = None,
    ) -> str:
        """
        Build the evaluation prompt.

        The optional context and the principle's examples (each a
        ``(text, is_violation, explanation)`` triple) are placed in the
        template's context section.
        """
        context_section = f"\nCONTEXT:\n{context}\n" if context else ""

        if principle.examples:
            limit = self._EXAMPLE_PREVIEW_CHARS
            example_lines = []
            for text, is_violation, explanation in principle.examples:
                status = "VIOLATION" if is_violation else "OK"
                # Mark the text as cut only when it really was shortened.
                preview = text[:limit] + "..." if len(text) > limit else text
                example_lines.append(f"- [{status}] \"{preview}\" - {explanation}\n")
            context_section += "\n\nEXAMPLES:\n" + "".join(example_lines)

        return self.EVALUATION_PROMPT.format(
            principle_name=principle.name,
            principle_description=principle.description,
            output=output,
            context_section=context_section,
        )

    @classmethod
    def _coerce_flag(cls, value: Any) -> bool:
        """Interpret a JSON verdict, treating quoted booleans such as "false" correctly."""
        if isinstance(value, str):
            return value.strip().lower() in cls._TRUE_STRINGS
        return bool(value)

    @classmethod
    def _coerce_confidence(cls, value: Any) -> float:
        """Convert a JSON confidence to a float in [0.0, 1.0], defaulting when unusable."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            # null, a list, or text such as "high": keep the parsed verdict
            # and fall back to the neutral default confidence.
            return cls._DEFAULT_CONFIDENCE
        return min(1.0, max(0.0, confidence))

    def _parse_response(self, response: str) -> tuple[bool, float, str, Optional[str]]:
        """
        Parse LLM response into structured result.

        The first brace-delimited object without nested braces is decoded as
        JSON. If none can be decoded, the reply is classified by keyword.

        Returns:
            Tuple of (is_violation, confidence, explanation, evidence)
        """
        data = None
        json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group())
            except ValueError:  # json.JSONDecodeError is a ValueError
                data = None

        if isinstance(data, dict):
            return (
                self._coerce_flag(data.get("is_violation", False)),
                self._coerce_confidence(data.get("confidence", self._DEFAULT_CONFIDENCE)),
                str(data.get("explanation", "No explanation provided")),
                data.get("evidence"),
            )

        # Fallback: simple keyword detection on the free-text reply.
        lowered = response.lower()
        negated = any(phrase in lowered for phrase in self._NEGATED_VIOLATION_PHRASES)
        is_violation = "violation" in lowered and not negated
        return (is_violation, self._DEFAULT_CONFIDENCE, response[:200], None)

    def evaluate(
        self,
        output: str,
        principle: Principle,
        context: Optional[str] = None,
    ) -> tuple[bool, float, str]:
        """
        Evaluate using LLM.

        Returns:
            Tuple of (is_violation, confidence, explanation); any evidence
            parsed from the reply is not part of the result.
        """
        prompt = self._build_prompt(output, principle, context)
        response = self._model_fn(prompt)
        is_violation, confidence, explanation, _ = self._parse_response(response)
        return (is_violation, confidence, explanation)

    async def evaluate_async(
        self,
        output: str,
        principle: Principle,
        context: Optional[str] = None,
    ) -> tuple[bool, float, str]:
        """
        Async evaluate using LLM.

        Without an ``async_model_fn`` this runs the synchronous ``evaluate``
        inline, i.e. the sync model call happens inside the coroutine.
        """
        if self._async_model_fn is None:
            # Fall back to sync
            return self.evaluate(output, principle, context)

        prompt = self._build_prompt(output, principle, context)
        response = await self._async_model_fn(prompt)
        is_violation, confidence, explanation, _ = self._parse_response(response)
        return (is_violation, confidence, explanation)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
# ============================================================================
|
|
632
|
+
# Main Validator
|
|
633
|
+
# ============================================================================
|
|
634
|
+
|
|
635
|
+
class ConstitutionalValidator:
    """
    Constitutional Validator for checking AI outputs against principles.

    Every principle is handed to an evaluation backend. Violations reported
    with at least ``min_confidence`` are collected, and the output passes
    when none of them has critical or high severity.

    The validator works on its own copy of the supplied principles, so
    ``add_principle`` / ``remove_principle`` never alter the caller's
    ``PrincipleSet`` or list (for instance a shared built-in set).

    Example:
        # Basic usage with built-in safety principles
        validator = ConstitutionalValidator(principles=SAFETY_PRINCIPLES)
        result = validator.validate("Some AI output...")

        if not result.passed:
            print(f"Found {len(result.violations)} violations")

        # With custom evaluator
        validator = ConstitutionalValidator(
            principles=SAFETY_PRINCIPLES,
            evaluator=LLMEvaluator(model_fn=my_llm_call)
        )

        # Async validation
        result = await validator.validate_async("Some AI output...")
    """

    def __init__(
        self,
        principles: Union[PrincipleSet, list[Principle]],
        evaluator: Optional[PrincipleEvaluator] = None,
        min_confidence: float = 0.7,
        fail_on_evaluator_error: bool = False,
    ):
        """
        Initialize the validator.

        Args:
            principles: Principles to validate against
            evaluator: Evaluation backend (defaults to RuleBasedEvaluator)
            min_confidence: Minimum confidence to consider a violation
            fail_on_evaluator_error: If True, treat evaluator errors as violations
        """
        if isinstance(principles, PrincipleSet):
            # Copy the list and wrap it in a private set: sharing the caller's
            # list would let add/remove_principle edit the caller's set.
            self._principles = list(principles.principles)
            self._principle_set = PrincipleSet(
                name=principles.name,
                principles=self._principles,
                description=principles.description,
                version=principles.version,
            )
        else:
            self._principles = list(principles)
            self._principle_set = PrincipleSet(name="custom", principles=self._principles)

        self._evaluator = evaluator or RuleBasedEvaluator()
        self._min_confidence = min_confidence
        self._fail_on_error = fail_on_evaluator_error
        self._validation_count = 0

    @property
    def principles(self) -> list[Principle]:
        """Get the list of principles."""
        return self._principles

    @property
    def principle_set(self) -> PrincipleSet:
        """Get the principle set (it shares its list with ``principles``)."""
        return self._principle_set

    def add_principle(self, principle: Principle) -> None:
        """Add a principle to the validator."""
        self._principles.append(principle)

    def remove_principle(self, name: str) -> bool:
        """Remove the first principle with this name. Returns True if found and removed."""
        for i, p in enumerate(self._principles):
            if p.name == name:
                self._principles.pop(i)
                return True
        return False

    @staticmethod
    def _describe_error(principle: Principle, error: Exception) -> dict[str, str]:
        """Summarize an evaluator failure for the result metadata."""
        return {
            "principle": str(getattr(principle, "name", principle)),
            "error": f"{type(error).__name__}: {error}",
        }

    def _build_result(
        self,
        output: str,
        principles_checked: int,
        violations: list[Violation],
        errors: list[dict[str, str]],
        validation_id: int,
        **extra_metadata: Any,
    ) -> ValidationResult:
        """Assemble the ValidationResult shared by ``validate`` and ``validate_async``."""
        # Determine if passed (no critical or high violations)
        blocking = [v for v in violations
                    if v.principle.severity in (Severity.CRITICAL, Severity.HIGH)]
        metadata: dict[str, Any] = {
            "validation_id": validation_id,
            "principle_set": self._principle_set.name,
            "evaluator": type(self._evaluator).__name__,
            **extra_metadata,
        }
        if errors:
            # Present only when an evaluation raised, so a failing backend is
            # visible even when its errors are not turned into violations.
            metadata["evaluator_errors"] = errors
        return ValidationResult(
            passed=not blocking,
            violations=violations,
            output_text=output,
            principles_checked=principles_checked,
            metadata=metadata,
        )

    def validate(
        self,
        output: str,
        context: Optional[str] = None,
        principles: Optional[list[Principle]] = None,
    ) -> ValidationResult:
        """
        Validate an output against principles.

        Args:
            output: The text to validate
            context: Optional context about the conversation/task
            principles: Optional subset of principles to check (defaults to
                all; an empty list also selects all)

        Returns:
            ValidationResult with any violations found. When an evaluation
            raised, ``metadata["evaluator_errors"]`` lists the failures.
        """
        self._validation_count += 1
        validation_id = self._validation_count
        principles_to_check = principles or self._principles
        violations: list[Violation] = []
        errors: list[dict[str, str]] = []

        for principle in principles_to_check:
            try:
                is_violation, confidence, explanation = self._evaluator.evaluate(
                    output, principle, context
                )

                if is_violation and confidence >= self._min_confidence:
                    violations.append(Violation(
                        principle=principle,
                        confidence=confidence,
                        explanation=explanation,
                    ))
            except Exception as e:
                errors.append(self._describe_error(principle, e))
                if self._fail_on_error:
                    violations.append(Violation(
                        principle=principle,
                        confidence=1.0,
                        explanation=f"Evaluator error (treating as violation): {e}",
                    ))

        return self._build_result(
            output, len(principles_to_check), violations, errors, validation_id
        )

    async def validate_async(
        self,
        output: str,
        context: Optional[str] = None,
        principles: Optional[list[Principle]] = None,
        parallel: bool = True,
    ) -> ValidationResult:
        """
        Async validate an output against principles.

        Args:
            output: The text to validate
            context: Optional context
            principles: Optional subset of principles (an empty list selects all)
            parallel: If True, evaluate principles in parallel

        Returns:
            ValidationResult with any violations found. When an evaluation
            raised, ``metadata["evaluator_errors"]`` lists the failures.
        """
        self._validation_count += 1
        # Taken before the first await: another validation may advance the
        # counter while this one is suspended.
        validation_id = self._validation_count
        principles_to_check = principles or self._principles

        async def check_principle(
            principle: Principle,
        ) -> tuple[Optional[Violation], Optional[dict[str, str]]]:
            """Evaluate one principle; returns (violation or None, error or None)."""
            try:
                is_violation, confidence, explanation = await self._evaluator.evaluate_async(
                    output, principle, context
                )
                if is_violation and confidence >= self._min_confidence:
                    return Violation(
                        principle=principle,
                        confidence=confidence,
                        explanation=explanation,
                    ), None
            except Exception as e:
                error = self._describe_error(principle, e)
                if self._fail_on_error:
                    return Violation(
                        principle=principle,
                        confidence=1.0,
                        explanation=f"Evaluator error: {e}",
                    ), error
                return None, error
            return None, None

        if parallel:
            # gather returns results in argument order, so violations and
            # errors follow the order of principles_to_check.
            outcomes = await asyncio.gather(
                *[check_principle(p) for p in principles_to_check]
            )
        else:
            outcomes = [await check_principle(p) for p in principles_to_check]

        violations = [violation for violation, _ in outcomes if violation is not None]
        errors = [error for _, error in outcomes if error is not None]

        return self._build_result(
            output,
            len(principles_to_check),
            violations,
            errors,
            validation_id,
            parallel=parallel,
        )

    def get_stats(self) -> dict[str, Any]:
        """Get validator statistics."""
        return {
            "validation_count": self._validation_count,
            "principle_count": len(self._principles),
            "principle_set": self._principle_set.name,
            "evaluator": type(self._evaluator).__name__,
            "min_confidence": self._min_confidence,
        }
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
# ============================================================================
|
|
851
|
+
# Convenience Functions
|
|
852
|
+
# ============================================================================
|
|
853
|
+
|
|
854
|
+
def validate_safety(output: str, context: Optional[str] = None) -> ValidationResult:
    """
    Check ``output`` against the built-in safety principles.

    A fresh validator with the default evaluator is created for each call.

    Args:
        output: Text to validate
        context: Optional context

    Returns:
        ValidationResult
    """
    return ConstitutionalValidator(principles=SAFETY_PRINCIPLES).validate(output, context)
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
def validate_medical(output: str, context: Optional[str] = None) -> ValidationResult:
    """Check ``output`` against the safety principles merged with the medical ones."""
    merged_set = SAFETY_PRINCIPLES.merge(MEDICAL_PRINCIPLES)
    return ConstitutionalValidator(principles=merged_set).validate(output, context)
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
def validate_financial(output: str, context: Optional[str] = None) -> ValidationResult:
    """Check ``output`` against the safety principles merged with the financial ones."""
    merged_set = SAFETY_PRINCIPLES.merge(FINANCIAL_PRINCIPLES)
    return ConstitutionalValidator(principles=merged_set).validate(output, context)
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
def create_validator_from_yaml(yaml_str: str) -> ConstitutionalValidator:
    """
    Create a validator from YAML configuration.

    YAML format:
        name: my_principles
        description: Custom principles
        principles:
          - name: rule_1
            description: First rule description
            severity: high
            category: safety

    Args:
        yaml_str: YAML document describing a principle set

    Returns:
        A ConstitutionalValidator (default evaluator) over the parsed set

    Raises:
        ImportError: If PyYAML is not installed.
    """
    # Only the import is guarded: an ImportError raised while parsing or
    # building the principle set must not be reported as PyYAML missing.
    try:
        import yaml
    except ImportError as err:
        raise ImportError(
            "PyYAML is required for YAML configuration. Install with: pip install pyyaml"
        ) from err

    data = yaml.safe_load(yaml_str)
    principle_set = PrincipleSet.from_dict(data)
    return ConstitutionalValidator(principles=principle_set)
|