evalvault 1.69.0__tar.gz → 1.70.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalvault-1.69.0 → evalvault-1.70.0}/PKG-INFO +1 -1
- evalvault-1.70.0/config/regressions/ci.json +20 -0
- evalvault-1.70.0/docs/guides/CI_REGRESSION_GATE.md +36 -0
- evalvault-1.70.0/docs/guides/MULTITURN_EVAL_GUIDE.md +45 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/pyproject.toml +1 -1
- evalvault-1.70.0/scripts/ci/run_regression_gate.py +97 -0
- evalvault-1.70.0/scripts/offline/bundle_datasets.sh +34 -0
- evalvault-1.70.0/scripts/offline/restore_datasets.sh +16 -0
- evalvault-1.70.0/src/evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py +212 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/uv.lock +1 -1
- {evalvault-1.69.0 → evalvault-1.70.0}/.dockerignore +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.env.example +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.env.offline.example +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.github/workflows/ci.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.github/workflows/release.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.github/workflows/stale.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.gitignore +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.pre-commit-config.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/.python-version +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/AGENTS.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/CHANGELOG.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/CLAUDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/CODE_OF_CONDUCT.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/CONTRIBUTING.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/Dockerfile +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/LICENSE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/README.en.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/SECURITY.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/agent.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/client.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/main.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory/shared/decisions.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory/shared/dependencies.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory/templates/coordinator_guide.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory/templates/work_log_template.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/memory_integration.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/progress.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/app_spec.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/baseline.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/coding_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/existing_project_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/improvement/architecture_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/improvement/base_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/improvement/coordinator_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/improvement/observability_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/initializer_prompt.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/prompt_manifest.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts/system.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/prompts.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/requirements.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/agent/security.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/domains/insurance/memory.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/domains/insurance/terms_dictionary_en.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/domains/insurance/terms_dictionary_ko.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/methods.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/models.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/ragas_prompts_override.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/regressions/default.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/regressions/ux.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/stage_metric_playbook.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/config/stage_metric_thresholds.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/dummy_test_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/insurance_qa_korean.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/insurance_qa_korean.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/insurance_qa_korean_2.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/insurance_qa_korean_3.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/ragas_ko90_en10.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/sample.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/visualization_20q_cluster_map.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/visualization_20q_korean.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/visualization_2q_cluster_map.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/datasets/visualization_2q_korean.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/kg/knowledge_graph.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/rag/user_guide_bm25.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/raw/The Complete Guide to Mastering Suno Advanced Strategies for Professional Music Generation.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/raw/edge_cases.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/raw/run_mode_full_domain_memory.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/data/raw/sample_rag_knowledge.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/dataset_templates/dataset_template.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/dataset_templates/dataset_template.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/dataset_templates/dataset_template.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/dataset_templates/method_input_template.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docker-compose.langfuse.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docker-compose.offline.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docker-compose.phoenix.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docker-compose.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/INDEX.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/README.ko.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/ROADMAP.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/STATUS.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/adapters/inbound.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/adapters/outbound.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/config.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/domain/entities.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/domain/metrics.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/domain/services.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/ports/inbound.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/api/ports/outbound.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/architecture/open-rag-trace-collector.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/architecture/open-rag-trace-spec.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/getting-started/INSTALLATION.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/AGENTS_SYSTEM_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/CHAINLIT_INTEGRATION_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/CLI_MCP_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/CLI_PARALLEL_FEATURES_SPEC.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/DEV_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/DOCS_REFRESH_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/EVALVAULT_DIAGNOSTIC_PLAYBOOK.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/EVALVAULT_RUN_EXCEL_SHEETS.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/EVALVAULT_WORK_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/EXTERNAL_TRACE_API_SPEC.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/Extension_2.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/Extension_Data_Difficulty_Profiling_Custom_Judge_Model.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/INSURANCE_SUMMARY_METRICS_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/LENA_MVP_IMPLEMENTATION_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/LENA_RAGAS_CALIBRATION_DEV_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/NEXT_STEPS_EXECUTION_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/OFFLINE_DOCKER.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/OPEN_RAG_TRACE_INTERNAL_ADAPTER.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/OPEN_RAG_TRACE_SAMPLES.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/P0_P3_EXECUTION_REPORT.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/PARALLEL_WORK_APPROVAL_RULES.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/PRD_LENA.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/PROJECT_STATUS_AND_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RAGAS_HUMAN_FEEDBACK_CALIBRATION_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RAG_CLI_WORKFLOW_TEMPLATES.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RAG_NOISE_REDUCTION_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RAG_PERFORMANCE_IMPLEMENTATION_LOG.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RAG_PERFORMANCE_IMPROVEMENT_PROPOSAL.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/RELEASE_CHECKLIST.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/USER_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/WEBUI_CLI_ROLLOUT_PLAN.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/cli_process.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/prompt_suggestions_design.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/rag_human_feedback_calibration_implementation_plan.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/refactoring_strategy.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/guides/repeat_query.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/mapping/component-to-whitepaper.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/00_frontmatter.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/01_overview.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/02_architecture.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/03_data_flow.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/04_components.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/05_expert_lenses.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/06_implementation.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/07_advanced.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/08_customization.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/09_quality.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/10_performance.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/11_security.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/12_operations.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/13_standards.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/14_roadmap.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/INDEX.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/new_whitepaper/STYLE_GUIDE.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/REFAC_000_master_plan.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/REFAC_010_agent_playbook.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/REFAC_020_logging_policy.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/REFAC_030_phase0_responsibility_map.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/REFAC_040_wbs_parallel_plan.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/logs/phase-0-baseline.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/logs/phase-1-evaluator.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/logs/phase-2-cli-run.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/refactor/logs/phase-3-analysis.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/security_audit_worklog.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/stylesheets/extra.css +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/dataset_template.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/dataset_template.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/dataset_template.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/eval_report_templates.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/kg_template.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/otel_openinference_trace_example.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/ragas_dataset_example_ko90_en10.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/templates/retriever_docs_template.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/tools/generate-whitepaper.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/docs/web_ui_analysis_migration_plan.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/dummy_test_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/korean_rag/faithfulness_test.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/korean_rag/insurance_qa_100.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/korean_rag/keyword_extraction_test.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/korean_rag/retrieval_test.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/output/comparison.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/output/full_results.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/output/leaderboard.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/output/results_mteb.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/output/retrieval_result.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/benchmarks/run_korean_benchmark.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/kg_generator_demo.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/method_plugin_template/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/method_plugin_template/pyproject.toml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/method_plugin_template/src/method_plugin_template/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/method_plugin_template/src/method_plugin_template/methods.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/usecase/comprehensive_workflow_test.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/usecase/insurance_eval_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/examples/usecase/output/comprehensive_report.html +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/.env.example +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/.gitignore +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/Dockerfile +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/analysis-compare.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/analysis-lab.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/compare-runs.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/dashboard.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/domain-memory.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/evaluation-studio.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/knowledge-base.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/mocks/intents.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/mocks/run_details.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/mocks/runs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/e2e/run-details.spec.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/eslint.config.js +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/index.html +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/nginx.conf +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/package-lock.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/package.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/playwright.config.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/public/vite.svg +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/App.css +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/App.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/assets/react.svg +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/AnalysisNodeOutputs.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/InsightSpacePanel.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/Layout.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/MarkdownContent.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/PrioritySummaryPanel.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/SpaceLegend.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/SpacePlot2D.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/SpacePlot3D.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/StatusBadge.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/VirtualizedText.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/ai-elements/Conversation.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/ai-elements/Message.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/ai-elements/PromptInput.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/ai-elements/Response.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/components/ai-elements/index.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/config/ui.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/config.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/hooks/useInsightSpace.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/index.css +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/main.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/AnalysisCompareView.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/AnalysisLab.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/AnalysisResultView.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/Chat.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/CompareRuns.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/ComprehensiveAnalysis.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/CustomerReport.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/Dashboard.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/DomainMemory.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/EvaluationStudio.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/KnowledgeBase.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/RunDetails.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/Settings.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/Visualization.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/pages/VisualizationHome.tsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/services/api.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/types/plotly.d.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/utils/format.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/utils/phoenix.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/utils/runAnalytics.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/utils/score.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/src/utils/summaryMetrics.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/tailwind.config.js +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/tsconfig.app.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/tsconfig.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/tsconfig.node.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/frontend/vite.config.ts +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/mkdocs.yml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/package-lock.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/prompts/system_override.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/.gitkeep +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/final_output.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/index.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/load_runs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/report.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/run_change_detection.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_0aa9fab0_f1287e90/run_metric_comparison.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/final_output.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/index.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/load_runs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/report.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/run_change_detection.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_8f825b22_4516d358/run_metric_comparison.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/final_output.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/index.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/load_runs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/report.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/run_change_detection.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/artifacts/comparison_f1287e90_8f825b22/run_metric_comparison.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_0aa9fab0_9fbf4776.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_0aa9fab0_9fbf4776.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_0aa9fab0_f1287e90.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_0aa9fab0_f1287e90.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_8f825b22_4516d358.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_8f825b22_4516d358.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_9fbf4776_a491fa0e.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_9fbf4776_a491fa0e.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_f1287e90_8f825b22.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/comparison/comparison_f1287e90_8f825b22.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r1_smoke.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r2_graphrag.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r2_graphrag_openai.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r3_bm25.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r3_bm25_langfuse3.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/debug_report_r3_dense_faiss.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/improvement_1d91a667-4288-4742-be3a-a8f5310c5140.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r2_graphrag_openai_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r2_graphrag_openai_stage_report.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r2_graphrag_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r2_graphrag_stage_report.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_langfuse2_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_langfuse3_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_langfuse_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_phoenix_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_bm25_stage_report.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_dense_faiss_stage_events.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/r3_dense_faiss_stage_report.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/retrieval_benchmark_smoke_precision.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/retrieval_benchmark_smoke_precision_graphrag.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/reports/retrieval_benchmark_smoke_precision_multi.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/benchmark/download_kmmlu.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev/open_rag_trace_demo.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev/open_rag_trace_integration_template.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev/otel-collector-config.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev/start_web_ui_with_phoenix.sh +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev/validate_open_rag_trace.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/dev_seed_pipeline_results.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/analyzer/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/analyzer/ast_scanner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/analyzer/confidence_scorer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/analyzer/graph_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/analyzer/side_effect_detector.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/generate_api_docs.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/models/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/models/schema.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/renderer/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/docs/renderer/html_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/offline/export_images.sh +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/offline/import_images.sh +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/offline/smoke_test.sh +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/ops/phoenix_watch.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/backfill_langfuse_trace_url.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/r3_dense_smoke.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/r3_evalvault_run_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/r3_retriever_docs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/r3_smoke_real.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/perf/r3_stage_events_sample.jsonl +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/pipeline_template_inspect.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/reports/generate_release_notes.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/run_with_timeout.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/test_full_evaluation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/tests/run_regressions.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/tests/run_retriever_stage_report_smoke.sh +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/validate_tutorials.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/verify_ragas_compliance.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/scripts/verify_workflows.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/main.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/benchmark.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/chat.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/domain.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/knowledge.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/mcp.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/pipeline.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/api/routers/runs.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/app.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/agent.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/analyze.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/api.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/artifacts.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/benchmark.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/calibrate.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/calibrate_judge.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/compare.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/debug.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/domain.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/experiment.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/gate.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/generate.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/history.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/init.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/kg.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/langfuse.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/method.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/ops.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/phoenix.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/pipeline.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/profile_difficulty.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/prompts.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/regress.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/run.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/run_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/commands/stage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/analysis_io.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/console.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/errors.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/formatters.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/options.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/presets.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/progress.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/cli/utils/validators.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/mcp/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/mcp/schemas.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/inbound/mcp/tools.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/analysis_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/base_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/bm25_searcher_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/causal_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/causal_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/common.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/comparison_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/data_loader_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/detailed_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/diagnostic_playbook_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/embedding_distribution_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/embedding_searcher_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/hybrid_rrf_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/hybrid_weighted_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/hypothesis_generator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/llm_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/low_performer_extractor_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/model_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/morpheme_quality_checker_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/network_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/nlp_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/nlp_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/pattern_detector_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/pipeline_factory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/pipeline_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/priority_summary_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/retrieval_quality_checker_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/root_cause_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/run_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/run_change_detector_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/run_comparator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/run_loader_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/run_metric_comparator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/search_comparator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/statistical_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/statistical_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/statistical_comparator_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/summary_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/time_series_analyzer_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/timeseries_advanced_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/trend_detector_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/analysis/verification_report_module.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/artifact_fs.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/benchmark/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/benchmark/lm_eval_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/cache/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/cache/hybrid_cache.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/cache/memory_cache.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/base.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/csv_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/excel_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/json_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/loader_factory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/method_input_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/streaming_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/templates.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/dataset/thresholds.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/debug/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/debug/report_renderer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/documents/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/documents/ocr/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/documents/ocr/paddleocr_backend.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/documents/pdf_extractor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/documents/versioned_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/domain_memory/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/domain_memory/domain_memory_schema.sql +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/domain_memory/sqlite_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/filesystem/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/improvement/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/improvement/insight_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/improvement/pattern_detector.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/improvement/playbook_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/improvement/stage_metric_playbook_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/judge_calibration_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/judge_calibration_reporter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/kg/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/kg/graph_rag_retriever.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/kg/networkx_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/kg/parallel_kg_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/kg/query_strategies.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/anthropic_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/azure_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/base.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/factory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/instructor_factory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/llm_relation_augmenter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/ollama_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/openai_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/token_aware_chat.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/llm/vllm_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/methods/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/methods/baseline_oracle.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/methods/external_command.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/methods/registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/bm25_retriever.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/dense_retriever.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/document_chunker.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/hybrid_retriever.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/kiwi_tokenizer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/korean_evaluation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/korean_stopwords.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/toolkit.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/phoenix/sync_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/report/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/report/dashboard_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/report/llm_report_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/report/markdown_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/base_sql.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/benchmark_storage_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/postgres_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/postgres_schema.sql +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/schema.sql +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/storage/sqlite_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/open_rag_log_handler.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/open_rag_trace_decorators.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracer/phoenix_tracer_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracker/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracker/langfuse_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracker/log_sanitizer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracker/mlflow_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/adapters/outbound/tracker/phoenix_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/agent_types.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/domain_config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/instrumentation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/langfuse_support.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/model_config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/phoenix_support.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/playbooks/improvement_playbook.yaml +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/secret_manager.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/config/settings.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/debug_ragas.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/debug_ragas_real.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/analysis.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/analysis_pipeline.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/benchmark.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/benchmark_run.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/dataset.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/debug.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/experiment.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/feedback.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/improvement.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/judge_calibration.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/kg.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/memory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/method.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/prompt.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/prompt_suggestion.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/rag_trace.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/result.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/entities/stage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/analysis_registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/confidence.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/contextual_relevancy.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/entity_preservation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/insurance.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/no_answer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/retrieval_rank.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/summary_accuracy.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/summary_needs_followup.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/summary_non_definitive.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/summary_risk_coverage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/terms_dictionary.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/metrics/text_match.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/analysis_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/artifact_lint_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/async_batch_executor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/batch_executor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/benchmark_report_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/benchmark_runner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/benchmark_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/cache_metrics.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/cluster_map_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/custom_metric_snapshot.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/dataset_preprocessor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/debug_report_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/difficulty_profile_reporter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/difficulty_profiling_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/document_chunker.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/document_versioning.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/domain_learning_hook.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/embedding_overlay.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/entity_extractor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/evaluator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/experiment_comparator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/experiment_manager.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/experiment_reporter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/experiment_repository.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/experiment_statistics.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/holdout_splitter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/improvement_guide_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/intent_classifier.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/judge_calibration_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/kg_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/memory_aware_evaluator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/memory_based_analysis.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/method_runner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/ops_snapshot_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/pipeline_orchestrator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/pipeline_template_registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_candidate_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_manifest.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_scoring_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_status.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/prompt_suggestion_reporter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/ragas_prompt_overrides.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/regression_gate_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/retrieval_metrics.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/retriever_context.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/run_comparison_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/satisfaction_calibration_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/stage_event_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/stage_metric_guide_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/stage_metric_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/stage_summary_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/synthetic_qa_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/testset_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/threshold_profiles.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/unified_report_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/domain/services/visual_space_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/mkdocs_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/inbound/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/inbound/analysis_pipeline_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/inbound/evaluator_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/inbound/learning_hook_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/inbound/web_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/analysis_cache_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/analysis_module_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/analysis_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/artifact_fs_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/benchmark_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/causal_analysis_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/comparison_pipeline_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/dataset_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/difficulty_profile_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/domain_memory_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/embedding_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/improvement_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/intent_classifier_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/judge_calibration_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/korean_nlp_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/llm_factory_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/llm_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/method_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/nlp_analysis_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/ops_snapshot_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/relation_augmenter_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/report_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/stage_storage_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/storage_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/tracer_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/ports/outbound/tracker_port.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/reports/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/reports/release_notes.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/scripts/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/src/evalvault/scripts/regression_runner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/conftest.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/README.md +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/benchmark/retrieval_ground_truth_min.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/benchmark/retrieval_ground_truth_multi.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/auto_insurance_qa_korean_full.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/callcenter_summary_5cases.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/comprehensive_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/edge_cases.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/edge_cases.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/evaluation_test_sample.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/graphrag_retriever_docs.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/graphrag_smoke.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_document.txt +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_english.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_english.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_english.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_korean.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_korean.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_korean.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/insurance_qa_korean_versioned_pdf.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/run_mode_full_domain_memory.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/run_mode_simple.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/e2e/summary_eval_minimal.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/kg/minimal_graph.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/sample_dataset.csv +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/sample_dataset.json +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/fixtures/sample_dataset.xlsx +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/benchmark/test_benchmark_service_integration.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/conftest.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_cli_integration.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_data_flow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_e2e_scenarios.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_evaluation_flow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_full_workflow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_langfuse_flow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_phoenix_flow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_pipeline_api_contracts.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_storage_flow.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/integration/test_summary_eval_fixture.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/optional_deps.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/inbound/mcp/test_execute_tools.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/inbound/mcp/test_read_tools.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/documents/test_pdf_extractor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/documents/test_versioned_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/improvement/__init__.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/improvement/test_insight_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/improvement/test_pattern_detector.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/improvement/test_playbook_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/improvement/test_stage_metric_playbook_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/kg/test_graph_rag_retriever.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/kg/test_parallel_kg_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/adapters/outbound/storage/test_benchmark_storage_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/config/test_phoenix_support.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/conftest.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_analysis_metric_registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_confidence.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_contextual_relevancy.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_entity_preservation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_metric_registry.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_no_answer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_retrieval_rank.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/metrics/test_text_match.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_cache_metrics.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_claim_level.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_dataset_preprocessor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_document_versioning.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_evaluator_comprehensive.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_holdout_splitter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_improvement_guide_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_judge_calibration_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_ops_snapshot_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_regression_gate_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_retrieval_metrics.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_retriever_context.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_stage_event_builder.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_stage_metric_guide_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/services/test_synthetic_qa_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/test_embedding_overlay.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/test_prompt_manifest.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/domain/test_prompt_status.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/reports/test_release_notes.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/scripts/test_regression_runner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_agent_types.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_analysis_entities.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_analysis_modules.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_analysis_pipeline.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_analysis_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_anthropic_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_artifact_lint_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_async_batch_executor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_azure_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_benchmark_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_benchmark_runner.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_causal_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_artifacts.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_calibrate_judge.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_domain.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_init.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_ops.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_progress.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_cli_utils.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_data_loaders.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_difficulty_profiling_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_domain_config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_domain_memory.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_entities.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_entities_kg.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_entity_extractor.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_evaluator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_experiment.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_hybrid_cache.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_instrumentation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_insurance_metric.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_intent_classifier.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_kg_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_kg_networkx.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_kiwi_tokenizer.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_kiwi_warning_suppression.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_korean_dense.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_korean_evaluation.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_korean_retrieval.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_langfuse_tracker.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_llm_relation_augmenter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_lm_eval_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_markdown_report.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_memory_cache.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_memory_services.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_method_plugins.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_mlflow_tracker.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_model_config.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_nlp_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_nlp_entities.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_ollama_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_openai_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_phoenix_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_pipeline_orchestrator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_ports.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_postgres_storage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_prompt_candidate_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_rag_trace_entities.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_regress_cli.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_run_comparison_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_run_memory_helpers.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_run_mode_fixtures.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_settings.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_sqlite_storage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_stage_cli.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_stage_event_schema.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_stage_metric_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_stage_storage.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_stage_summary_service.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_statistical_adapter.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_streaming_loader.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_summary_eval_fixture.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_testset_generator.py +0 -0
- {evalvault-1.69.0 → evalvault-1.70.0}/tests/unit/test_web_adapter.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: evalvault
|
|
3
|
-
Version: 1.69.0
|
|
3
|
+
Version: 1.70.0
|
|
4
4
|
Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
|
|
5
5
|
Project-URL: Homepage, https://github.com/ntts9990/EvalVault
|
|
6
6
|
Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"suites": [
|
|
3
|
+
{
|
|
4
|
+
"name": "unit-cli-gate",
|
|
5
|
+
"description": "Quality gate related CLI behavior",
|
|
6
|
+
"command": ["pytest", "tests/unit/test_cli.py", "-k", "gate", "-q"],
|
|
7
|
+
"timeout": 600
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"name": "integration-cli-e2e",
|
|
11
|
+
"description": "CLI E2E smoke tests without API keys",
|
|
12
|
+
"command": [
|
|
13
|
+
"pytest",
|
|
14
|
+
"tests/integration/test_e2e_scenarios.py::TestCLIIntegrationE2E",
|
|
15
|
+
"-vv"
|
|
16
|
+
],
|
|
17
|
+
"timeout": 900
|
|
18
|
+
}
|
|
19
|
+
]
|
|
20
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# CI 회귀 게이트 (Regression Gate)
|
|
2
|
+
|
|
3
|
+
EvalVault의 회귀 게이트는 CI에서 **핵심 CLI 흐름이 깨지지 않았는지** 빠르게 확인하는 안전장치입니다.
|
|
4
|
+
|
|
5
|
+
## 목적
|
|
6
|
+
- PR/릴리즈마다 핵심 CLI 경로를 최소 비용으로 재검증
|
|
7
|
+
- API 키 없이 실행 가능한 스위트만 사용
|
|
8
|
+
|
|
9
|
+
## 구성
|
|
10
|
+
|
|
11
|
+
### 설정 파일
|
|
12
|
+
- `config/regressions/ci.json`
|
|
13
|
+
- `unit-cli-gate`: gate 관련 CLI 유닛 테스트
|
|
14
|
+
- `integration-cli-e2e`: API 키 없이 가능한 CLI e2e 스모크
|
|
15
|
+
|
|
16
|
+
### 실행 스크립트
|
|
17
|
+
- `scripts/ci/run_regression_gate.py`
|
|
18
|
+
|
|
19
|
+
## 로컬 실행
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uv run python scripts/ci/run_regression_gate.py \
|
|
23
|
+
--config config/regressions/ci.json \
|
|
24
|
+
--format text
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## CI 통합
|
|
28
|
+
|
|
29
|
+
- `.github/workflows/ci.yml`의 `regression-gate` job에서 실행
|
|
30
|
+
- 실패 시 CI가 실패하며, GitHub Actions 로그에 실패 스위트가 표시됩니다.
|
|
31
|
+
|
|
32
|
+
## 실패 기준
|
|
33
|
+
- 어떤 스위트든 실패 시 게이트 실패
|
|
34
|
+
|
|
35
|
+
## 요약 파일
|
|
36
|
+
- `reports/regression/ci_gate.json`에 요약이 저장됩니다.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# 멀티턴 평가 가이드
|
|
2
|
+
|
|
3
|
+
이 문서는 멀티턴(대화형) RAG 평가를 **단일 턴 데이터셋 구조** 안에서 운영하는 최소 기준을 정의합니다.
|
|
4
|
+
|
|
5
|
+
## 핵심 원칙
|
|
6
|
+
- 멀티턴은 `test_cases`를 평탄화(flatten)하고, 메타데이터로 세션/턴을 연결합니다.
|
|
7
|
+
- 기존 로더/평가/분석 파이프라인을 변경하지 않고, 추가 메타데이터로 멀티턴 집계를 수행합니다.
|
|
8
|
+
|
|
9
|
+
## 데이터셋 필드 규약 (필수)
|
|
10
|
+
`test_cases[].metadata`에 아래 키를 넣습니다.
|
|
11
|
+
|
|
12
|
+
```json
|
|
13
|
+
{
|
|
14
|
+
"metadata": {
|
|
15
|
+
"conversation_id": "conv-001",
|
|
16
|
+
"turn_index": 1,
|
|
17
|
+
"turn_id": "t01"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### 필드 정의
|
|
23
|
+
- `conversation_id`: 동일 대화 세션 식별자
|
|
24
|
+
- `turn_index`: 턴 순서(정수)
|
|
25
|
+
- `turn_id`: 턴 고유 ID (선택적으로 문자열)
|
|
26
|
+
|
|
27
|
+
## 실행/분석 흐름
|
|
28
|
+
1. `evalvault run`으로 실행 후 `--auto-analyze` 또는 별도 분석 파이프라인 실행
|
|
29
|
+
2. 분석 파이프라인의 `multiturn_analyzer` 모듈이 대화/턴 집계를 생성
|
|
30
|
+
3. 산출물은 `reports/analysis/artifacts/analysis_<RUN_ID>/index.json`에 등록
|
|
31
|
+
|
|
32
|
+
## 산출물 요약
|
|
33
|
+
`multiturn_analyzer` 모듈 출력:
|
|
34
|
+
- `summary`: 대화 수, 평균 턴 수, 대화 단위 통과율, 최초 실패 턴 분포
|
|
35
|
+
- `conversations`: 대화별 요약(최악 턴, 메트릭 평균)
|
|
36
|
+
- `turns`: 턴 단위 상세
|
|
37
|
+
- `coverage`: conversation_id/turn_index 커버리지
|
|
38
|
+
|
|
39
|
+
## 주의사항
|
|
40
|
+
- `turn_index`가 누락되면 대화 순서를 정확히 복원할 수 없습니다.
|
|
41
|
+
- `conversation_id`가 없는 케이스는 대화 집계에서 제외됩니다.
|
|
42
|
+
|
|
43
|
+
## 예시 템플릿
|
|
44
|
+
- `docs/templates/dataset_template.json`
|
|
45
|
+
- `docs/templates/ragas_dataset_example_ko90_en10.json`
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Run regression suites for CI quality gate."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
from collections.abc import Sequence
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from evalvault.scripts.regression_runner import (
|
|
10
|
+
append_issue_log,
|
|
11
|
+
format_summary,
|
|
12
|
+
load_regression_config,
|
|
13
|
+
run_regression_suites,
|
|
14
|
+
select_suites,
|
|
15
|
+
write_json_summary,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _parse_args() -> argparse.Namespace:
|
|
20
|
+
parser = argparse.ArgumentParser(description="EvalVault regression gate runner")
|
|
21
|
+
parser.add_argument("--config", type=Path, default=None, help="Regression config path")
|
|
22
|
+
parser.add_argument(
|
|
23
|
+
"--suites",
|
|
24
|
+
type=str,
|
|
25
|
+
default=None,
|
|
26
|
+
help="Comma-separated suite names to run",
|
|
27
|
+
)
|
|
28
|
+
parser.add_argument(
|
|
29
|
+
"--summary",
|
|
30
|
+
type=Path,
|
|
31
|
+
default=None,
|
|
32
|
+
help="Write JSON summary to a file",
|
|
33
|
+
)
|
|
34
|
+
parser.add_argument(
|
|
35
|
+
"--issue-log",
|
|
36
|
+
type=Path,
|
|
37
|
+
default=None,
|
|
38
|
+
help="Append summary to a markdown log",
|
|
39
|
+
)
|
|
40
|
+
parser.add_argument("--tag", type=str, default=None, help="Label for the run")
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--format",
|
|
43
|
+
type=str,
|
|
44
|
+
default="text",
|
|
45
|
+
choices=["text", "github-actions"],
|
|
46
|
+
help="Output format",
|
|
47
|
+
)
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--stop-on-failure",
|
|
50
|
+
action="store_true",
|
|
51
|
+
help="Stop on first suite failure",
|
|
52
|
+
)
|
|
53
|
+
return parser.parse_args()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _split_names(raw: str | None) -> Sequence[str] | None:
|
|
57
|
+
if not raw:
|
|
58
|
+
return None
|
|
59
|
+
return [name.strip() for name in raw.split(",") if name.strip()]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _emit_github_actions(results) -> None:
|
|
63
|
+
for result in results:
|
|
64
|
+
status = "✅" if result.succeeded else "❌"
|
|
65
|
+
print(f"{status} {result.name} — {result.status.upper()} ({result.duration:.1f}s)")
|
|
66
|
+
if not result.succeeded:
|
|
67
|
+
message = result.stderr.splitlines()[-1] if result.stderr else "Suite failed"
|
|
68
|
+
print(f"::error::Regression suite failed: {result.name} ({message})")
|
|
69
|
+
passed = all(result.succeeded for result in results)
|
|
70
|
+
print(f"::set-output name=passed::{str(passed).lower()}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def main() -> int:
|
|
74
|
+
args = _parse_args()
|
|
75
|
+
|
|
76
|
+
suites = load_regression_config(args.config)
|
|
77
|
+
selected = select_suites(suites, _split_names(args.suites))
|
|
78
|
+
results = run_regression_suites(selected, stop_on_failure=args.stop_on_failure)
|
|
79
|
+
|
|
80
|
+
summary = format_summary(results, tag=args.tag)
|
|
81
|
+
if args.summary:
|
|
82
|
+
write_json_summary(args.summary, results, tag=args.tag)
|
|
83
|
+
if args.issue_log:
|
|
84
|
+
append_issue_log(args.issue_log, summary)
|
|
85
|
+
|
|
86
|
+
if args.format == "github-actions":
|
|
87
|
+
_emit_github_actions(results)
|
|
88
|
+
else:
|
|
89
|
+
print(summary)
|
|
90
|
+
|
|
91
|
+
if any(not result.succeeded for result in results):
|
|
92
|
+
return 1
|
|
93
|
+
return 0
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
if __name__ == "__main__":
|
|
97
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
|
|
5
|
+
cd "$ROOT_DIR"
|
|
6
|
+
|
|
7
|
+
OUTPUT_TAR=${OUTPUT_TAR:-dist/evalvault_datasets.tar}
|
|
8
|
+
INCLUDE_DATA=${INCLUDE_DATA:-1}
|
|
9
|
+
INCLUDE_FIXTURES=${INCLUDE_FIXTURES:-1}
|
|
10
|
+
INCLUDE_TEMPLATES=${INCLUDE_TEMPLATES:-1}
|
|
11
|
+
|
|
12
|
+
ITEMS=()
|
|
13
|
+
|
|
14
|
+
if [ "$INCLUDE_DATA" = "1" ] && [ -d "data" ]; then
|
|
15
|
+
ITEMS+=("data")
|
|
16
|
+
fi
|
|
17
|
+
if [ "$INCLUDE_FIXTURES" = "1" ] && [ -d "tests/fixtures" ]; then
|
|
18
|
+
ITEMS+=("tests/fixtures")
|
|
19
|
+
fi
|
|
20
|
+
if [ "$INCLUDE_TEMPLATES" = "1" ] && [ -d "dataset_templates" ]; then
|
|
21
|
+
ITEMS+=("dataset_templates")
|
|
22
|
+
fi
|
|
23
|
+
|
|
24
|
+
if [ ${#ITEMS[@]} -eq 0 ]; then
|
|
25
|
+
echo "No dataset assets to bundle." >&2
|
|
26
|
+
exit 1
|
|
27
|
+
fi
|
|
28
|
+
|
|
29
|
+
mkdir -p "$(dirname "$OUTPUT_TAR")"
|
|
30
|
+
tar -cf "$OUTPUT_TAR" "${ITEMS[@]}"
|
|
31
|
+
sha256sum "$OUTPUT_TAR" > "${OUTPUT_TAR}.sha256"
|
|
32
|
+
|
|
33
|
+
echo "Saved: $OUTPUT_TAR"
|
|
34
|
+
echo "SHA256: ${OUTPUT_TAR}.sha256"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ARCHIVE=${1:-dist/evalvault_datasets.tar}
|
|
5
|
+
|
|
6
|
+
if [ ! -f "$ARCHIVE" ]; then
|
|
7
|
+
echo "Archive not found: $ARCHIVE" >&2
|
|
8
|
+
exit 1
|
|
9
|
+
fi
|
|
10
|
+
|
|
11
|
+
if [ -f "${ARCHIVE}.sha256" ]; then
|
|
12
|
+
sha256sum -c "${ARCHIVE}.sha256"
|
|
13
|
+
fi
|
|
14
|
+
|
|
15
|
+
tar -xf "$ARCHIVE"
|
|
16
|
+
echo "Restored dataset assets from $ARCHIVE"
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""
|
|
2
|
+
멀티턴 평가 요약 모듈입니다.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
|
|
11
|
+
from evalvault.adapters.outbound.analysis.pipeline_helpers import get_upstream_output, safe_mean
|
|
12
|
+
from evalvault.domain.entities import EvaluationRun
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MultiTurnAnalyzerModule(BaseAnalysisModule):
|
|
16
|
+
"""멀티턴(대화) 단위로 결과를 집계합니다."""
|
|
17
|
+
|
|
18
|
+
module_id = "multiturn_analyzer"
|
|
19
|
+
name = "멀티턴 분석"
|
|
20
|
+
description = "대화/턴 메타데이터를 기준으로 멀티턴 성능을 요약합니다."
|
|
21
|
+
input_types = ["run"]
|
|
22
|
+
output_types = ["multiturn_summary", "multiturn_conversations", "multiturn_turns"]
|
|
23
|
+
requires = ["data_loader"]
|
|
24
|
+
tags = ["analysis", "multiturn"]
|
|
25
|
+
|
|
26
|
+
def execute(
|
|
27
|
+
self,
|
|
28
|
+
inputs: dict[str, Any],
|
|
29
|
+
params: dict[str, Any] | None = None,
|
|
30
|
+
) -> dict[str, Any]:
|
|
31
|
+
loader_output = get_upstream_output(inputs, "load_data", "data_loader") or {}
|
|
32
|
+
run = loader_output.get("run")
|
|
33
|
+
if not isinstance(run, EvaluationRun):
|
|
34
|
+
return {
|
|
35
|
+
"available": False,
|
|
36
|
+
"summary": {},
|
|
37
|
+
"conversations": [],
|
|
38
|
+
"turns": [],
|
|
39
|
+
"coverage": {},
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
retrieval_meta = run.retrieval_metadata or {}
|
|
43
|
+
cases = run.results
|
|
44
|
+
total_cases = len(cases)
|
|
45
|
+
|
|
46
|
+
coverage = {
|
|
47
|
+
"total_cases": total_cases,
|
|
48
|
+
"has_conversation_id": 0,
|
|
49
|
+
"has_turn_index": 0,
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
|
|
53
|
+
turns: list[dict[str, Any]] = []
|
|
54
|
+
|
|
55
|
+
for result in cases:
|
|
56
|
+
case_meta = _resolve_case_metadata(retrieval_meta, result.test_case_id)
|
|
57
|
+
conversation_id = _coerce_text(case_meta.get("conversation_id"))
|
|
58
|
+
turn_index = _coerce_turn_index(case_meta.get("turn_index"))
|
|
59
|
+
turn_id = _coerce_text(case_meta.get("turn_id"))
|
|
60
|
+
|
|
61
|
+
if conversation_id:
|
|
62
|
+
coverage["has_conversation_id"] += 1
|
|
63
|
+
if turn_index is not None:
|
|
64
|
+
coverage["has_turn_index"] += 1
|
|
65
|
+
|
|
66
|
+
metrics = {
|
|
67
|
+
metric.name: metric.score for metric in result.metrics if metric.score is not None
|
|
68
|
+
}
|
|
69
|
+
avg_score = safe_mean(metrics.values()) if metrics else 0.0
|
|
70
|
+
failed_metrics = [metric.name for metric in result.metrics if not metric.passed]
|
|
71
|
+
entry = {
|
|
72
|
+
"test_case_id": result.test_case_id,
|
|
73
|
+
"conversation_id": conversation_id,
|
|
74
|
+
"turn_index": turn_index,
|
|
75
|
+
"turn_id": turn_id,
|
|
76
|
+
"avg_score": round(avg_score, 4),
|
|
77
|
+
"metrics": metrics,
|
|
78
|
+
"failed_metrics": failed_metrics,
|
|
79
|
+
"passed_all": result.all_passed,
|
|
80
|
+
}
|
|
81
|
+
turns.append(entry)
|
|
82
|
+
if conversation_id:
|
|
83
|
+
grouped[conversation_id].append(entry)
|
|
84
|
+
|
|
85
|
+
conversations: list[dict[str, Any]] = []
|
|
86
|
+
first_failure_hist: dict[str, int] = defaultdict(int)
|
|
87
|
+
|
|
88
|
+
for conversation_id, entries in grouped.items():
|
|
89
|
+
entries_sorted = _sort_turns(entries)
|
|
90
|
+
avg_scores = [item["avg_score"] for item in entries_sorted]
|
|
91
|
+
metric_scores: dict[str, list[float]] = defaultdict(list)
|
|
92
|
+
for item in entries_sorted:
|
|
93
|
+
for name, score in (item.get("metrics") or {}).items():
|
|
94
|
+
metric_scores[name].append(float(score))
|
|
95
|
+
|
|
96
|
+
metric_means = {
|
|
97
|
+
name: round(safe_mean(values), 4) for name, values in metric_scores.items()
|
|
98
|
+
}
|
|
99
|
+
passed_all = all(item.get("passed_all") for item in entries_sorted)
|
|
100
|
+
failure_turn = _first_failure_turn(entries_sorted)
|
|
101
|
+
if failure_turn is not None:
|
|
102
|
+
first_failure_hist[str(failure_turn)] += 1
|
|
103
|
+
|
|
104
|
+
worst_turn = _select_worst_turn(entries_sorted)
|
|
105
|
+
|
|
106
|
+
conversations.append(
|
|
107
|
+
{
|
|
108
|
+
"conversation_id": conversation_id,
|
|
109
|
+
"turn_count": len(entries_sorted),
|
|
110
|
+
"avg_score": round(safe_mean(avg_scores), 4),
|
|
111
|
+
"passed_all_turns": passed_all,
|
|
112
|
+
"first_failure_turn_index": failure_turn,
|
|
113
|
+
"worst_turn": worst_turn,
|
|
114
|
+
"metric_means": metric_means,
|
|
115
|
+
}
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
conversation_count = len(grouped)
|
|
119
|
+
turn_count = sum(len(items) for items in grouped.values())
|
|
120
|
+
summary = {
|
|
121
|
+
"conversation_count": conversation_count,
|
|
122
|
+
"turn_count": turn_count,
|
|
123
|
+
"avg_turns_per_conversation": round(
|
|
124
|
+
(turn_count / conversation_count) if conversation_count else 0.0, 3
|
|
125
|
+
),
|
|
126
|
+
"conversation_pass_rate": round(
|
|
127
|
+
(
|
|
128
|
+
sum(1 for item in conversations if item.get("passed_all_turns"))
|
|
129
|
+
/ conversation_count
|
|
130
|
+
)
|
|
131
|
+
if conversation_count
|
|
132
|
+
else 0.0,
|
|
133
|
+
4,
|
|
134
|
+
),
|
|
135
|
+
"first_failure_turn_histogram": dict(first_failure_hist),
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
if total_cases:
|
|
139
|
+
coverage["has_conversation_id"] = round(
|
|
140
|
+
coverage["has_conversation_id"] / total_cases, 4
|
|
141
|
+
)
|
|
142
|
+
coverage["has_turn_index"] = round(coverage["has_turn_index"] / total_cases, 4)
|
|
143
|
+
|
|
144
|
+
return {
|
|
145
|
+
"available": True,
|
|
146
|
+
"summary": summary,
|
|
147
|
+
"conversations": conversations,
|
|
148
|
+
"turns": turns,
|
|
149
|
+
"coverage": coverage,
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _resolve_case_metadata(
|
|
154
|
+
retrieval_metadata: dict[str, dict[str, Any]],
|
|
155
|
+
test_case_id: str,
|
|
156
|
+
) -> dict[str, Any]:
|
|
157
|
+
meta = retrieval_metadata.get(test_case_id)
|
|
158
|
+
if isinstance(meta, dict):
|
|
159
|
+
nested = meta.get("test_case_metadata")
|
|
160
|
+
if isinstance(nested, dict):
|
|
161
|
+
merged = dict(nested)
|
|
162
|
+
merged.update({k: v for k, v in meta.items() if k != "test_case_metadata"})
|
|
163
|
+
return merged
|
|
164
|
+
return dict(meta)
|
|
165
|
+
return {}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _coerce_text(value: Any) -> str | None:
|
|
169
|
+
if value is None:
|
|
170
|
+
return None
|
|
171
|
+
if isinstance(value, str):
|
|
172
|
+
trimmed = value.strip()
|
|
173
|
+
return trimmed or None
|
|
174
|
+
return str(value)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _coerce_turn_index(value: Any) -> int | None:
|
|
178
|
+
if value is None:
|
|
179
|
+
return None
|
|
180
|
+
if isinstance(value, int):
|
|
181
|
+
return value
|
|
182
|
+
if isinstance(value, float) and value.is_integer():
|
|
183
|
+
return int(value)
|
|
184
|
+
if isinstance(value, str) and value.strip().isdigit():
|
|
185
|
+
return int(value.strip())
|
|
186
|
+
return None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _sort_turns(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
190
|
+
if all(item.get("turn_index") is None for item in entries):
|
|
191
|
+
return list(entries)
|
|
192
|
+
return sorted(
|
|
193
|
+
entries, key=lambda item: (item.get("turn_index") is None, item.get("turn_index") or 0)
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _first_failure_turn(entries: list[dict[str, Any]]) -> int | None:
|
|
198
|
+
for item in entries:
|
|
199
|
+
if not item.get("passed_all"):
|
|
200
|
+
return item.get("turn_index")
|
|
201
|
+
return None
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _select_worst_turn(entries: list[dict[str, Any]]) -> dict[str, Any] | None:
|
|
205
|
+
if not entries:
|
|
206
|
+
return None
|
|
207
|
+
worst = min(entries, key=lambda item: item.get("avg_score", 0.0))
|
|
208
|
+
return {
|
|
209
|
+
"test_case_id": worst.get("test_case_id"),
|
|
210
|
+
"avg_score": worst.get("avg_score"),
|
|
211
|
+
"failed_metrics": worst.get("failed_metrics", []),
|
|
212
|
+
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|