evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -233,13 +233,13 @@ def _aggregate_stage_metrics(metrics: Iterable[StageMetric]) -> dict[str, dict[s
233
233
 
234
234
  aggregated: dict[str, dict[str, float]] = {}
235
235
  for name, entries in buckets.items():
236
- scores = [m.score for m in entries]
236
+ scores = [m.score for m in entries if m.score is not None]
237
237
  threshold = next(
238
238
  (m.threshold for m in entries if m.threshold is not None),
239
239
  DEFAULT_STAGE_THRESHOLDS.get(name),
240
240
  )
241
241
  aggregated[name] = {
242
- "avg": mean(scores) if scores else None,
242
+ "avg": mean(scores) if scores else 0.0,
243
243
  "threshold": threshold if threshold is not None else DEFAULT_METRIC_THRESHOLD,
244
244
  }
245
245
  return aggregated
@@ -770,6 +770,77 @@ def _build_case_coords(result: TestCaseResult) -> dict[str, float | None]:
770
770
  ),
771
771
  ]
772
772
  )
773
+
774
+ if x_value is None:
775
+ x_value = _weighted_average(
776
+ [
777
+ (
778
+ _centered_norm(
779
+ scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
780
+ ),
781
+ 0.4,
782
+ ),
783
+ (
784
+ _centered_norm(
785
+ scores.get("summary_risk_coverage"),
786
+ thresholds.get("summary_risk_coverage"),
787
+ ),
788
+ 0.3,
789
+ ),
790
+ (
791
+ _centered_norm(
792
+ scores.get("summary_faithfulness"),
793
+ thresholds.get("summary_faithfulness"),
794
+ ),
795
+ 0.2,
796
+ ),
797
+ (
798
+ _centered_norm(scores.get("summary_score"), thresholds.get("summary_score")),
799
+ 0.1,
800
+ ),
801
+ (
802
+ _centered_norm(
803
+ scores.get("entity_preservation"),
804
+ thresholds.get("entity_preservation"),
805
+ ),
806
+ 0.2,
807
+ ),
808
+ ]
809
+ )
810
+
811
+ if y_value is None:
812
+ y_value = _weighted_average(
813
+ [
814
+ (
815
+ _centered_norm(
816
+ scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
817
+ ),
818
+ 0.35,
819
+ ),
820
+ (
821
+ _centered_norm(
822
+ scores.get("summary_non_definitive"),
823
+ thresholds.get("summary_non_definitive"),
824
+ ),
825
+ 0.35,
826
+ ),
827
+ (
828
+ _centered_norm(
829
+ scores.get("summary_needs_followup"),
830
+ thresholds.get("summary_needs_followup"),
831
+ ),
832
+ 0.3,
833
+ ),
834
+ (
835
+ _centered_norm(
836
+ scores.get("entity_preservation"),
837
+ thresholds.get("entity_preservation"),
838
+ ),
839
+ 0.2,
840
+ ),
841
+ ]
842
+ )
843
+
773
844
  return {"x": x_value, "y": y_value}
774
845
 
775
846
 
@@ -799,8 +870,12 @@ def _build_cluster_points(
799
870
 
800
871
  points = []
801
872
  for cluster_id, coords_list in clusters.items():
802
- x_values = [c.get("x") for c in coords_list if c.get("x") is not None]
803
- y_values = [c.get("y") for c in coords_list if c.get("y") is not None]
873
+ x_values = [
874
+ value for value in (c.get("x") for c in coords_list) if isinstance(value, (int, float))
875
+ ]
876
+ y_values = [
877
+ value for value in (c.get("y") for c in coords_list) if isinstance(value, (int, float))
878
+ ]
804
879
  x_avg = mean(x_values) if x_values else None
805
880
  y_avg = mean(y_values) if y_values else None
806
881
  quadrant = _quadrant_label(x_avg, y_avg)
@@ -11,6 +11,7 @@ from evalvault.ports.outbound.benchmark_port import (
11
11
  BenchmarkTaskResult,
12
12
  )
13
13
  from evalvault.ports.outbound.causal_analysis_port import CausalAnalysisPort
14
+ from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
14
15
  from evalvault.ports.outbound.dataset_port import DatasetPort
15
16
  from evalvault.ports.outbound.domain_memory_port import (
16
17
  BehaviorMemoryPort,
@@ -38,6 +39,7 @@ from evalvault.ports.outbound.improvement_port import (
38
39
  PlaybookPort,
39
40
  )
40
41
  from evalvault.ports.outbound.intent_classifier_port import IntentClassifierPort
42
+ from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
41
43
  from evalvault.ports.outbound.korean_nlp_port import (
42
44
  FaithfulnessResultProtocol,
43
45
  KoreanNLPToolkitPort,
@@ -58,6 +60,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
58
60
  __all__ = [
59
61
  "AnalysisCachePort",
60
62
  "AnalysisPort",
63
+ "ComparisonPipelinePort",
61
64
  "CausalAnalysisPort",
62
65
  "DatasetPort",
63
66
  "DomainMemoryPort",
@@ -83,6 +86,7 @@ __all__ = [
83
86
  "PatternDefinitionProtocol",
84
87
  "MetricPlaybookProtocol",
85
88
  "ClaimImprovementProtocol",
89
+ "JudgeCalibrationPort",
86
90
  "LLMFactoryPort",
87
91
  "LLMPort",
88
92
  "MethodRuntime",
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Protocol
5
+
6
+
7
+ class ArtifactFileSystemPort(Protocol):
8
+ def exists(self, path: Path) -> bool: ...
9
+
10
+ def is_dir(self, path: Path) -> bool: ...
11
+
12
+ def read_text(self, path: Path) -> str: ...
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+ from evalvault.domain.entities.analysis_pipeline import PipelineResult
6
+
7
+
8
+ class ComparisonPipelinePort(Protocol):
9
+ def run_comparison(
10
+ self,
11
+ *,
12
+ run_ids: list[str],
13
+ compare_metrics: list[str] | None,
14
+ test_type: str,
15
+ parallel: bool,
16
+ concurrency: int | None,
17
+ report_type: str,
18
+ use_llm_report: bool,
19
+ ) -> PipelineResult: ...
20
+
21
+
22
+ __all__ = ["ComparisonPipelinePort"]
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Protocol
5
+
6
+
7
+ class DifficultyProfileWriterPort(Protocol):
8
+ def write_profile(
9
+ self,
10
+ *,
11
+ output_path: Path,
12
+ artifacts_dir: Path,
13
+ envelope: dict[str, object],
14
+ artifacts: dict[str, object],
15
+ ) -> dict[str, object]: ...
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+ from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
6
+ from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
7
+
8
+
9
+ class JudgeCalibrationPort(Protocol):
10
+ def calibrate(
11
+ self,
12
+ run: EvaluationRun,
13
+ feedbacks: list[SatisfactionFeedback],
14
+ *,
15
+ labels_source: str,
16
+ method: str,
17
+ metrics: list[str],
18
+ holdout_ratio: float,
19
+ seed: int,
20
+ parallel: bool = False,
21
+ concurrency: int = 8,
22
+ ) -> JudgeCalibrationResult: ...
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Protocol
5
+
6
+
7
+ class OpsSnapshotWriterPort(Protocol):
8
+ def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None: ...
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evalvault
3
- Version: 1.64.0
3
+ Version: 1.66.0
4
4
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
5
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
6
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -25,6 +25,7 @@ Classifier: Topic :: Software Development :: Quality Assurance
25
25
  Classifier: Topic :: Software Development :: Testing
26
26
  Classifier: Typing :: Typed
27
27
  Requires-Python: >=3.12
28
+ Requires-Dist: chainlit>=2.9.5
28
29
  Requires-Dist: chardet
29
30
  Requires-Dist: fastapi>=0.128.0
30
31
  Requires-Dist: instructor
@@ -137,12 +138,17 @@ English version? See `README.en.md`.
137
138
  ## Quick Links
138
139
 
139
140
  - 문서 허브: `docs/INDEX.md`
141
+ - CLI 실행 시나리오 가이드: `docs/guides/RAG_CLI_WORKFLOW_TEMPLATES.md`
140
142
  - 사용자 가이드: `docs/guides/USER_GUIDE.md`
141
143
  - 개발 가이드: `docs/guides/DEV_GUIDE.md`
142
144
  - 상태/로드맵: `docs/STATUS.md`, `docs/ROADMAP.md`
143
145
  - 개발 백서(설계/운영/품질 기준): `docs/new_whitepaper/INDEX.md`
144
146
  - Open RAG Trace: `docs/architecture/open-rag-trace-spec.md`
145
147
 
148
+ ### 다음 개선 작업 메모
149
+ - 보험 요약 메트릭 확장 계획: `docs/guides/INSURANCE_SUMMARY_METRICS_PLAN.md`
150
+ - Prompt 반복 적용 계획: `docs/guides/repeat_query.md`
151
+
146
152
  ---
147
153
 
148
154
  ## EvalVault가 해결하는 문제
@@ -470,6 +476,24 @@ npm run dev
470
476
  - Ragas 계열: `faithfulness`, `answer_relevancy`, `context_precision`, `context_recall`, `factual_correctness`, `semantic_similarity`
471
477
  - 커스텀 예시(도메인): `insurance_term_accuracy`
472
478
 
479
+ ### 요약 메트릭 설계 근거 (summary_score, summary_faithfulness, entity_preservation)
480
+
481
+ ### 커스텀 메트릭 스냅샷 (평가 방식/과정/결과 기록)
482
+ - 평가 방식/입출력/규칙/구현 파일 해시를 `run.tracker_metadata.custom_metric_snapshot`에 기록합니다.
483
+ - Excel `CustomMetrics` 시트와 Langfuse/Phoenix/MLflow artifact에도 함께 저장됩니다.
484
+
485
+ - `summary_faithfulness`: 요약의 모든 주장이 컨텍스트에 근거하는지 평가합니다. 환각/왜곡 리스크를 직접적으로 측정합니다.
486
+ - `summary_score`: 컨텍스트 대비 요약의 핵심 정보 보존/간결성 균형을 평가합니다. 정답 요약 단일 기준의 편향을 줄입니다.
487
+ - `entity_preservation`: 금액·기간·조건·면책 등 보험 약관에서 중요한 엔티티가 요약에 유지되는지 측정합니다.
488
+
489
+ **보험 도메인 특화 근거**
490
+ - 보험 약관에서 치명적인 요소(면책, 자기부담, 한도, 조건 등)를 키워드로 직접 반영하고, 금액/기간/비율 같은 핵심 엔티티를 보존하도록 설계했습니다.
491
+ - 범용 규칙(숫자/기간/금액)과 보험 특화 키워드를 함께 사용하므로, 현재 상태는 “보험 리스크 중심의 약한 도메인 특화”로 보는 것이 정확합니다.
492
+
493
+ **해석 주의사항**
494
+ - 세 메트릭 모두 `contexts` 품질에 크게 의존합니다. 컨텍스트가 부정확/과도하면 점수가 낮아질 수 있습니다.
495
+ - `summary_score`는 키프레이즈 기반이므로, 표현이 달라지면 점수가 낮게 나올 수 있습니다.
496
+
473
497
  정확한 옵션/운영 레시피는 `docs/guides/USER_GUIDE.md`를 기준으로 최신화합니다.
474
498
 
475
499
  ---
@@ -5,10 +5,11 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
5
5
  evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
7
7
  evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
8
- evalvault/adapters/inbound/api/adapter.py,sha256=tYkJciUUFOK80QcSwzrqkXP1G4qUFItFV7uBYbjBGqU,68473
9
- evalvault/adapters/inbound/api/main.py,sha256=lRuyg3aBs5jIk7tq4p4d7jrRkFpV_brZypoOq8s56Rk,6896
8
+ evalvault/adapters/inbound/api/adapter.py,sha256=HgWSYyUxvJPlaSG158WVzpPckpPCYV9Ec3CWN8rLFdI,69118
9
+ evalvault/adapters/inbound/api/main.py,sha256=skYtmDngdOBryyLXQpNGlSd2Te6RF6GtfIwcMACPHFU,7068
10
10
  evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
11
11
  evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
12
+ evalvault/adapters/inbound/api/routers/chat.py,sha256=3S6-ljiY1COlDuVDH5yzMJs9SO0EkuosRcJIYScHWvI,18143
12
13
  evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
13
14
  evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
14
15
  evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
@@ -16,28 +17,34 @@ evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPe
16
17
  evalvault/adapters/inbound/api/routers/runs.py,sha256=rydOvwWk24QIYafu3XYS3oL_VVCE_jHDmjADhA19T1s,40059
17
18
  evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
18
19
  evalvault/adapters/inbound/cli/app.py,sha256=ytNgHRg9ZTAl33AkB1wIL8RKfQ_Cf8fsy0gSsLTs7Ew,1603
19
- evalvault/adapters/inbound/cli/commands/__init__.py,sha256=cNPPhsudTQWdlh_OJm9mU8LGBnJLGMswJBcIV9MAlkI,3530
20
+ evalvault/adapters/inbound/cli/commands/__init__.py,sha256=kw0SAEwOce1v92Pd6YpQjSYsdwLU95TQqbKGM44fNhY,3995
20
21
  evalvault/adapters/inbound/cli/commands/agent.py,sha256=YlOYMEzzS1aSKDKD_a7UK3St18X6GXGkdTatrzyd8Zc,7555
21
22
  evalvault/adapters/inbound/cli/commands/analyze.py,sha256=aMi1BEDOX3yhN-ppBftDssPQLB5TdzIfpx9U7CZEgWo,48932
22
23
  evalvault/adapters/inbound/cli/commands/api.py,sha256=YdbJ_-QEajnFcjTa7P2heLMjFKpeQ4nWP_p-HvfYkEo,1943
24
+ evalvault/adapters/inbound/cli/commands/artifacts.py,sha256=bE8FQxmnU0mMIAPx5en8aKrtfNNkrbWoLxIX4ZT9D5c,3776
23
25
  evalvault/adapters/inbound/cli/commands/benchmark.py,sha256=RZ4nRTF7d6hDZug-Pw8dGcFEyWdOKclwqkvS-gN4VWo,41097
24
26
  evalvault/adapters/inbound/cli/commands/calibrate.py,sha256=-UnT0LQH40U5lzMLqMJ7DOTLa3mt5P_fJL2XzqIkvu4,4223
27
+ evalvault/adapters/inbound/cli/commands/calibrate_judge.py,sha256=hJBlNl9Rt-ZtoIu-HKfudhZb2j2HOoEnRbiG4n5TOTE,10348
28
+ evalvault/adapters/inbound/cli/commands/compare.py,sha256=X_uyJoT_yQP43RTWMLCwMuHwhOb8wCqFShjy477V-2c,10384
25
29
  evalvault/adapters/inbound/cli/commands/config.py,sha256=Mv9IQHBFHZ3I2stUzHDgLDn-Znt_Awdy3j-sk5ruUmw,6069
26
30
  evalvault/adapters/inbound/cli/commands/debug.py,sha256=KU-hL1gLhpjV2ZybDQgGMwRfm-hCynkrqY4UzETfL9k,2234
27
31
  evalvault/adapters/inbound/cli/commands/domain.py,sha256=dL9iqBlnr5mDeS1unXW6uxE0qp6yfnxj-ls6k3EenwI,27279
28
32
  evalvault/adapters/inbound/cli/commands/experiment.py,sha256=jficaFOsZ9EMHrPHCOZjq6jpFrgmqCwmIo--wA_OcvQ,10389
29
33
  evalvault/adapters/inbound/cli/commands/gate.py,sha256=SxBSHALhekw9OVuJcuk64tkS8YMDDsgmhMALTE38wwY,9956
30
34
  evalvault/adapters/inbound/cli/commands/generate.py,sha256=7IPvd0WAwPxt9uaxmzqWCwt0b2VC_wXiVxyJ3lP-xys,8562
31
- evalvault/adapters/inbound/cli/commands/history.py,sha256=P8rK_nRJrmtG3y9obq3OSYtDZGZt2o3660_9JtbSPkg,11100
35
+ evalvault/adapters/inbound/cli/commands/history.py,sha256=3xf1l-I8IW-1Vtne9ypepDMDRRbwOpEvAjh4Qf9tV2w,8420
32
36
  evalvault/adapters/inbound/cli/commands/init.py,sha256=7q86fUeBVA08fU_N0lAV6Lakxirq4val2jIyALlDy3E,8822
33
37
  evalvault/adapters/inbound/cli/commands/kg.py,sha256=ycV9Xj6SUUJLTyTfLZcjXDVLcZqwo7Gw878ZhZAeDoc,19155
34
38
  evalvault/adapters/inbound/cli/commands/langfuse.py,sha256=aExhZ5WYT0FzJI4v1sF-a1jqy9b1BF46_HBtfiQjVGI,4085
35
39
  evalvault/adapters/inbound/cli/commands/method.py,sha256=OWdoofhvsDJchgNKnGGjXfIsZ-IHKZEo6RlmTsZRRYM,19124
40
+ evalvault/adapters/inbound/cli/commands/ops.py,sha256=2r6hdrZ7STnWMhtzYmv8jF_ukBq4HuKB1El6YnyxwrY,4035
36
41
  evalvault/adapters/inbound/cli/commands/phoenix.py,sha256=LQi3KTLq1ybjjBuz92oQ6lYyBS3mHrCHk0qe-7bqB4U,15611
37
42
  evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=NeqWLzO9kRDuZd0pHAIHglP3F7VzoNOU4JI0QcSZ120,7788
43
+ evalvault/adapters/inbound/cli/commands/profile_difficulty.py,sha256=nOJH3iqgLAlXq4keLBj5oqpiRCg0jjGgT-7Q57HxEh8,6665
38
44
  evalvault/adapters/inbound/cli/commands/prompts.py,sha256=lddde5VbjYaqN_9gHPLNu6DWpg5fE-KqZzjN-XYwvJw,27153
39
- evalvault/adapters/inbound/cli/commands/run.py,sha256=X19rgXhajhvZNA4c0JMmzmPatTxhZgfapuW07bZL9xA,119265
40
- evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=hu2TioocitUZzGR7HUwZ6gOeEJSvt5tGNjwXOlo4Eic,40336
45
+ evalvault/adapters/inbound/cli/commands/regress.py,sha256=Dy8hUOdjapxOW9Hoov0DHHblkMaExiqWfYS14CaC9Kk,8806
46
+ evalvault/adapters/inbound/cli/commands/run.py,sha256=aKoZcQbOJ1KB_4zPk4L-AWw3u9vGWg3SaooR7A3Xd_Y,119910
47
+ evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=93jFUg8QLrD38QU2JhOhFMoHDWUphSEKRdJ5KcUvrkQ,40806
41
48
  evalvault/adapters/inbound/cli/commands/stage.py,sha256=oRC9c5CysLX90Iy5Ba1pc_00DaOBS78lcBvzkbdrGRM,17123
42
49
  evalvault/adapters/inbound/cli/utils/__init__.py,sha256=QPNKneZS-Z-tTnYYxtgJXgcJWY6puUlRQcKrn7Mlv1M,685
43
50
  evalvault/adapters/inbound/cli/utils/analysis_io.py,sha256=RHkKEq4e-PtbtRDlXAJWU80RYHNPw-O5V9_GujdaGfc,13393
@@ -52,6 +59,9 @@ evalvault/adapters/inbound/mcp/__init__.py,sha256=kctJsmaP4fY94T3WCOhgANk3TCLdfb
52
59
  evalvault/adapters/inbound/mcp/schemas.py,sha256=KUKm4gEc-UDyF8sUbyzAnAIzyZ6DcXsaCEIVR3oESNQ,4469
53
60
  evalvault/adapters/inbound/mcp/tools.py,sha256=fnvkWS5p93o3FNmUSbh3EW4jCAVwtBKHX6kDuEbXkK8,24219
54
61
  evalvault/adapters/outbound/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
+ evalvault/adapters/outbound/artifact_fs.py,sha256=nySk7-10B9rQpV9EslBmRUr0gFIi9V1E_E_wdC1WpIU,439
63
+ evalvault/adapters/outbound/judge_calibration_adapter.py,sha256=vu5dVJVo5AXa5ULcx8WE-6YgfD718BC8Ci1_HAdwy20,1155
64
+ evalvault/adapters/outbound/judge_calibration_reporter.py,sha256=R0nId6P1jYQ3M3636knPNFztuc8kewDBKsg2LLcUw58,2005
55
65
  evalvault/adapters/outbound/analysis/__init__.py,sha256=TLuS-eKfXg97_Db5td1nTZkD3BErRLZLic1v2EAM2sA,6185
56
66
  evalvault/adapters/outbound/analysis/analysis_report_module.py,sha256=xah3wgJErHD_Hpb1YAwWRsxr8xaC8SW--CpNA7IgfxI,3957
57
67
  evalvault/adapters/outbound/analysis/base_module.py,sha256=eUN77SSD2KR4WKU7gLY8TlVewETx_YIZvPT4LUnBv4o,2523
@@ -59,6 +69,7 @@ evalvault/adapters/outbound/analysis/bm25_searcher_module.py,sha256=I8BsXrHaOVxg
59
69
  evalvault/adapters/outbound/analysis/causal_adapter.py,sha256=Rt5QcoLDEjx8u_yidACz3u8SbAVYSJO6lLu6udwnd4U,27410
60
70
  evalvault/adapters/outbound/analysis/causal_analyzer_module.py,sha256=hBcTx7ZyUZ6HQ6I6W2VvSZ1ndatlgMen2KjKXk_Ltx4,6780
61
71
  evalvault/adapters/outbound/analysis/common.py,sha256=H1RqNBiOt7WRcHUM3jFydd3850GFQhEDUu8WBEhtMws,5734
72
+ evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py,sha256=D_ZPVgQHz3Cn3fxxl-TgLEoo_RNbhsOJCvqH-cJ2Lf4,1577
62
73
  evalvault/adapters/outbound/analysis/comparison_report_module.py,sha256=0tTMZB5qpGMaxlcWtTtTln7Y_jFEDFaaW7V-UyboBDM,2343
63
74
  evalvault/adapters/outbound/analysis/data_loader_module.py,sha256=6X0-ZcFtEfonQnbJ0POqmHXstJ1Wq1NvpijtbKSeEm0,3749
64
75
  evalvault/adapters/outbound/analysis/detailed_report_module.py,sha256=59CjuNQthlroJyGEhQap3PgahWfzXciKx_DD10gHXjM,3897
@@ -125,6 +136,9 @@ evalvault/adapters/outbound/documents/ocr/paddleocr_backend.py,sha256=AORA9JUV5u
125
136
  evalvault/adapters/outbound/domain_memory/__init__.py,sha256=ksMX1IkNiDqQHLtJe9TOXiLC1iouGt6_QSdPLiALHHs,229
126
137
  evalvault/adapters/outbound/domain_memory/domain_memory_schema.sql,sha256=APlNhJNFZdcm7Sb2tvr7V8JMiLinmXkx1gd6pgTf9ZI,11268
127
138
  evalvault/adapters/outbound/domain_memory/sqlite_adapter.py,sha256=RWobnFgvxiItxFAr6niY89sT19O-cnExTbP0I7UAY78,85186
139
+ evalvault/adapters/outbound/filesystem/__init__.py,sha256=eTQLuVPMpEctE92TtegKQT3wuJTIhiBS38BzfxRV-N0,122
140
+ evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py,sha256=9qO9_3E-SL6ngDOia6zcw680S1fQloxo32f6hx76YHs,1626
141
+ evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py,sha256=sOuUk8VD8yWwG8508uw1zqSnNsp3dQrF9bp9T2z-n48,448
128
142
  evalvault/adapters/outbound/improvement/__init__.py,sha256=tXA6vaZOLvqwJpyjGMiC8WrvszMmvUPzJnHjvJhQxSI,1143
129
143
  evalvault/adapters/outbound/improvement/insight_generator.py,sha256=U16l0euCZy0_08Zb_i0eijXSjS5t-iq0iMUfttwPqgI,17636
130
144
  evalvault/adapters/outbound/improvement/pattern_detector.py,sha256=uFFjWNy8A4KIihw_ANtL6At73RirwNnFnN4rFsEvcXk,24602
@@ -164,10 +178,10 @@ evalvault/adapters/outbound/nlp/korean/toolkit_factory.py,sha256=x3v-AAkVInOabC4
164
178
  evalvault/adapters/outbound/phoenix/sync_service.py,sha256=i6gHpNiZXKQ5yzV9B2TPb-P1N45k_Ck5ruzh3oqp4d8,9122
165
179
  evalvault/adapters/outbound/report/__init__.py,sha256=8OUduTHnWkBLHYrc7mBg45DnAwz0RgvSJmz1HqxVjLY,477
166
180
  evalvault/adapters/outbound/report/dashboard_generator.py,sha256=Dcu18NTK4lS8XNKnnnquagpZkd-4TSf5Mb2isFNW5Pk,7800
167
- evalvault/adapters/outbound/report/llm_report_generator.py,sha256=HUDA_IPBbl54cyEjTTJzdKTQ6H4IoZi-1VBdVmZf0uI,26593
181
+ evalvault/adapters/outbound/report/llm_report_generator.py,sha256=i_iXfY8qutIb8TsvLKyMLnijsA0yiNJ3rBEFg4zVqcE,26858
168
182
  evalvault/adapters/outbound/report/markdown_adapter.py,sha256=5PS72h_qe4ZtYs-umhX5TqQL2k5SuDaCUc6rRw9AKRw,16761
169
183
  evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
170
- evalvault/adapters/outbound/storage/base_sql.py,sha256=7jWtmNDBHncLDABf5ewwQJnfhFjySTfpfDJmEbPBD1w,40823
184
+ evalvault/adapters/outbound/storage/base_sql.py,sha256=bNjJr941wqeLgv4E772JlOer1Q8OpJWxyotsNNn_R98,42536
171
185
  evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
172
186
  evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=HLaoQ3YJDFwOxeY0S92oPIqb-7EgWSasgt89RM86vr0,47148
173
187
  evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=A9MfO0pjf4kjxoRj2KPI0Gg1cbX13I2YE3oieT-PGiI,8906
@@ -180,10 +194,10 @@ evalvault/adapters/outbound/tracer/open_rag_trace_decorators.py,sha256=LFnk-3FSL
180
194
  evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py,sha256=D48Mbj-ioDKztjhV9513Q5DiUNiVdO60B_2sWMFEmnI,3520
181
195
  evalvault/adapters/outbound/tracer/phoenix_tracer_adapter.py,sha256=inmTAolAVsm0IrszE9VTJoI7HSvGGAnGNZVu_vZRAGg,741
182
196
  evalvault/adapters/outbound/tracker/__init__.py,sha256=Suu5BznOK5uTuD5_jS8JMZd8RPfQNlddLxHCBvMTm_4,358
183
- evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=HmuMVUfDYjqNqHZGZMRybhrgca_EmeENuX7DfP-L5Fg,18504
197
+ evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=uI-t5v9AC5VUMYsIc1FHYImourZeErGMXB0_prOMErc,18839
184
198
  evalvault/adapters/outbound/tracker/log_sanitizer.py,sha256=ilKTTSzsHslQYc-elnWu0Z3HKNNw1D1iI0_cCvYbo1M,2653
185
- evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=m4xj3XBULFYg27U3twKrldLhbLyLNefezmb2pCpHJrw,7180
186
- evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=sz5TyWC67e3YbQd2y-ogU9_66rilLdf8TbC-7bN_JR0,24316
199
+ evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=6pSxbxSDZE7jN7uSMU6VFg0JlO7cBiMLYcd53NYpfcY,7350
200
+ evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=8p2qJeKn6OvIxNbD16h_QrhmCzKIBIf8_ej535MNn_A,26443
187
201
  evalvault/config/__init__.py,sha256=UCgeDx62M2gOuFvdN29wWwny2fdH4bPY_uUC3-42eDw,1297
188
202
  evalvault/config/agent_types.py,sha256=EP2Pv3ZtOzDXIvIa-Hnd1to9JIbMUtGitrlwzZtx0Ys,13418
189
203
  evalvault/config/domain_config.py,sha256=rOgNA2T8NWlDzcEFC0shdUCCww0lI1E5fUm5QrKQSZI,9264
@@ -192,10 +206,10 @@ evalvault/config/langfuse_support.py,sha256=DEzVMfMGGf1V45W_2oUG-NCDfsYI4UUdnYJI
192
206
  evalvault/config/model_config.py,sha256=KlzDbGyDLeOGE7ElekFFk5YjjT5u8i6KO2B4EyZkLnI,3542
193
207
  evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHWyVyOH0,13604
194
208
  evalvault/config/secret_manager.py,sha256=YjPMuNqeBrAR2BzCJvsBNUExaU4TBSFyZ8kVYZZifqA,4172
195
- evalvault/config/settings.py,sha256=JKJf8t20sOHYnHoCfTxqupQixNgfmWYJhChiGMNz-W0,17617
209
+ evalvault/config/settings.py,sha256=xvoNma4CHAd8R_nF0DL4MUWXBWCR5M0C68NPSPLT5JQ,18285
196
210
  evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
197
211
  evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
198
- evalvault/domain/entities/__init__.py,sha256=RZi_6oQcq-2-sJcydfKOSr03vFxo-mF7CGHN9Ma4Cdg,3379
212
+ evalvault/domain/entities/__init__.py,sha256=wszRJ1Imdc5NJ1bQPC2udk-mAgFdlw4uZV5IPNjLpHQ,3669
199
213
  evalvault/domain/entities/analysis.py,sha256=gcMtumC66g-AIqb2LgfMpm5BMzwJhJkjg-zuybNoJCM,15208
200
214
  evalvault/domain/entities/analysis_pipeline.py,sha256=hD9rFHMa4rUq0InRkSKhh6HQ9ZeNYAHKADzs-kWRP04,16845
201
215
  evalvault/domain/entities/benchmark.py,sha256=CVbz_eW7Y9eM7wG7xA_xmldTIs72csdoTmu3E0NKoMU,18475
@@ -205,42 +219,51 @@ evalvault/domain/entities/debug.py,sha256=r92lgvOpq2svw70syJIo78muRAvrSn5h1JByH_
205
219
  evalvault/domain/entities/experiment.py,sha256=oWjbu0IJZ6oIRcnA-8ppeJDgp57Tv8ZjQ3UOZ0X9KJ8,2576
206
220
  evalvault/domain/entities/feedback.py,sha256=xiaZaUQhyuxyW_i2scXt8eKZshMC6tXe3981e-uukw8,1604
207
221
  evalvault/domain/entities/improvement.py,sha256=WHI7q1jXRxkuHhBWOrpk8UdLaH0UwjZVjRIDsqVDyZo,19322
222
+ evalvault/domain/entities/judge_calibration.py,sha256=fhQEI7g2nZuG1OliikhxgefcFAJldDqEmfTs9Mp-FPk,1234
208
223
  evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0U8,3001
209
224
  evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
210
225
  evalvault/domain/entities/method.py,sha256=a3jZi7SjcpK3HeVyVwQkUMwpnmg2RbxCnH4NqYPLCOI,1157
211
- evalvault/domain/entities/prompt.py,sha256=VzuUzqkqXv0FwTSNGyV5sSCft5sxTbG_Noq6Ymnke5o,2910
226
+ evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KTNjVwfY,2920
212
227
  evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
213
228
  evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
214
229
  evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
215
- evalvault/domain/entities/stage.py,sha256=dbVzhgpP_p2p2eDJBWe7mwyyl6zUTP9kEKN_YRUvufY,7183
216
- evalvault/domain/metrics/__init__.py,sha256=fxjC5Z_8OuBIeMn80bYgnZZxpNoay2wH-qtG3NqCUvk,797
230
+ evalvault/domain/entities/stage.py,sha256=UqS59sjoMs_bhMupNtvagbIx8QgHgFjWoRPhJ3uJP2s,7426
231
+ evalvault/domain/metrics/__init__.py,sha256=Ros3CWg5in1xlEdMa0WUSG602SBVkxw2Zbro-XUlmxU,1214
217
232
  evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
218
233
  evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
219
234
  evalvault/domain/metrics/contextual_relevancy.py,sha256=xAPYUv_0TM4j4WOutOSGftNln_l-2Ev6qpANeu4REv8,11057
220
235
  evalvault/domain/metrics/entity_preservation.py,sha256=uSCbaETceE5PbGn-230Rm8pryOA8jDkkeOwAkWxA65g,6500
221
236
  evalvault/domain/metrics/insurance.py,sha256=5NPeAi_86rpuZRgV4KhzomGrq3Uw2jjglN6FfA_AO8o,4040
222
237
  evalvault/domain/metrics/no_answer.py,sha256=x6vRyOa1jw-qsnw9kOYT8YMPdLElaDRu7zjNCpyJfqM,8237
223
- evalvault/domain/metrics/registry.py,sha256=QKjo4RNHxCqObGg36xJP3KAHqFpHM50Jy7GeSksdz0Y,5665
238
+ evalvault/domain/metrics/registry.py,sha256=1CKPxSjdXK739zhzwodukGDL-dIhlJJH30cuP-czQWc,6926
224
239
  evalvault/domain/metrics/retrieval_rank.py,sha256=F55ByadJBowyKHKBmKAZ0T0qN_R1_7UNu-MiLnT4Ypg,14675
240
+ evalvault/domain/metrics/summary_accuracy.py,sha256=Hr4QS1e4Rxt1MgcTj5rElKuPw9rWS-zGkI0d8wB5dwA,5988
241
+ evalvault/domain/metrics/summary_needs_followup.py,sha256=5kExtZxxankP7csAAIZe_1uRFeBD7NQK-N15b5d0awM,1357
242
+ evalvault/domain/metrics/summary_non_definitive.py,sha256=1EE-z0Ib66gpjc0MGZHmZJHJfpoACSIldgOwFkUNxg0,1029
243
+ evalvault/domain/metrics/summary_risk_coverage.py,sha256=Fo-dMg_jU4MCr0YqOZzBZymwEbG9y2H6eLX-jmuS8IU,1777
225
244
  evalvault/domain/metrics/terms_dictionary.json,sha256=-ZQmpx6yMOYoAOpcLj-xK2LkAeCbAw0EUb6-syIOKS0,3801
226
245
  evalvault/domain/metrics/text_match.py,sha256=P-YTZs9ekDqEmxLNBP8eXnMRymPdC8V4dJPtwG2ajVM,10219
227
246
  evalvault/domain/services/__init__.py,sha256=X5Af1kf_vSt3S3mFwOV6OQdro-lFxwbVdNd7nJznkC8,1024
228
247
  evalvault/domain/services/analysis_service.py,sha256=oUEtfJHB3bNJ_Ksygx-pjnLm4CTk7_rDvDbqfkAfFD4,10838
248
+ evalvault/domain/services/artifact_lint_service.py,sha256=80P46weoj9lBxOqg_ViHZEQ6Cfo69XV4cniZlmMsti0,8434
229
249
  evalvault/domain/services/async_batch_executor.py,sha256=qYFRl7CGmv56XppeRhInde7Fw0GESCoZh8V-Iv_1hQQ,11140
230
250
  evalvault/domain/services/batch_executor.py,sha256=cYA_Q1es46n_PYeyyfm0iM2b7GGVtDoOGoMxexrf6tI,1243
231
251
  evalvault/domain/services/benchmark_report_service.py,sha256=IF-zqtvpsJ0ONJWUEw4ghKiC7ka_PWxUBO10lPaDRmI,15083
232
- evalvault/domain/services/benchmark_runner.py,sha256=iqirGDs-yemSmqLDnCPGA7Wug0ps6z0vRgRsgetypOM,26328
252
+ evalvault/domain/services/benchmark_runner.py,sha256=4tvQEDrfvp2fC2luUPuPBcRjEPLHdrdystLpe3PnBqM,26046
233
253
  evalvault/domain/services/benchmark_service.py,sha256=TrmnvBMAPmcs0PewGZcn2rxHbviZ8KxmDvJCeyqm28I,6286
234
254
  evalvault/domain/services/cache_metrics.py,sha256=FKNZoxym30lc1SxTGmTn3Pr-PDNoAqgC9_d_IdF_jOQ,3463
235
255
  evalvault/domain/services/cluster_map_builder.py,sha256=qPKMPj-eSqECJSCOKvv3ZETgIwxwiKWbU3d6_feCoDg,6885
236
- evalvault/domain/services/dataset_preprocessor.py,sha256=v-shY5ky1oW0LJwBfdfP4VFh7TXBabpLD5rMOmtS-dQ,14235
256
+ evalvault/domain/services/custom_metric_snapshot.py,sha256=_MLOzBlHTRyTQ2NuunZ_lrLVF0__kvEcCUxXVVCeoRA,9684
257
+ evalvault/domain/services/dataset_preprocessor.py,sha256=PnhLiPk0E9DIzjUr8N75296CCfl1AUXGv-lpaXBi0Ok,14797
237
258
  evalvault/domain/services/debug_report_service.py,sha256=SGdFh8tctAIq7RotFbg47eetxdYSS4Yju7-LOzpCMCM,4386
259
+ evalvault/domain/services/difficulty_profile_reporter.py,sha256=uIj9-eiO2dDvQ6tP-DJBddfBq8VT63st0wtNC8Co4NQ,680
260
+ evalvault/domain/services/difficulty_profiling_service.py,sha256=wB3T2iz_dZjvj7wiU2fnM0XT-doMNokV_YqSt24Wc6A,11078
238
261
  evalvault/domain/services/document_chunker.py,sha256=u05N1xSBcJuJPUfP7WmpY_EyHuUMuGMsPSM9qs-ID8c,2494
239
262
  evalvault/domain/services/document_versioning.py,sha256=M1qZaMpQ2exVT1wkVAmvEPPuoYibJDt0F7pYfTK7mvE,3323
240
263
  evalvault/domain/services/domain_learning_hook.py,sha256=rhKBmdnrJyfGzFNsNxzyv8jZO26-WOosHSmBV_9qdJg,7176
241
264
  evalvault/domain/services/embedding_overlay.py,sha256=ZTNxUPXpHGbQ3Uri5DD3feTUFn7qrhuNshhyCQEvRuM,3559
242
265
  evalvault/domain/services/entity_extractor.py,sha256=f3Rf5saK8QsgetLNK1Hbxzt8PtttJZCicSR63S8DJ5k,14141
243
- evalvault/domain/services/evaluator.py,sha256=YReil1mokTILyllAbG_QnFhob-15G5tNeWZZMbSc3yo,67551
266
+ evalvault/domain/services/evaluator.py,sha256=Fvth2VdckDJvGuwxbXPnvPfQU59WZSJHV63H4qji4lM,78815
244
267
  evalvault/domain/services/experiment_comparator.py,sha256=IBrxIwux-8GucwlLx6e5lUqB9miSPvBLGJK9ctoW7Y0,3299
245
268
  evalvault/domain/services/experiment_manager.py,sha256=2k-qGiAUyZuqqmcp4P-M3Z9HTXwwcqW5HQYKNkcIHuI,4863
246
269
  evalvault/domain/services/experiment_reporter.py,sha256=QYlVmCFSx8hKTPMezc7QjJE07b3MSQ82Q4QVucSHLVY,1420
@@ -249,53 +272,62 @@ evalvault/domain/services/experiment_statistics.py,sha256=aOrqbBjB1swHPaFRziID1m
249
272
  evalvault/domain/services/holdout_splitter.py,sha256=Sos61Zy_bBjStt8LPHJ3KxDNda-OmX7AVUsT24K1n6Q,1910
250
273
  evalvault/domain/services/improvement_guide_service.py,sha256=gMoVFlDsprOEEfRGKmdbk9_Due62J63Q-rL2zr65Q0s,17881
251
274
  evalvault/domain/services/intent_classifier.py,sha256=hsWivDXqXJjCJEE-OI7eUGeYrewpYxlz67Z0TI3oskU,11707
275
+ evalvault/domain/services/judge_calibration_service.py,sha256=cOaAsbfMBlaDxoMAXe8MacDDRK0tCD-tXRnYjB6sEPs,19264
252
276
  evalvault/domain/services/kg_generator.py,sha256=oEugjPdn8Pb2Q3r5yAZl0dZJibNUkEherlRVquknB6k,24969
253
277
  evalvault/domain/services/memory_aware_evaluator.py,sha256=vTiYoxiMfZ_CMjSBjqwkBRdpiXRwQ2zXnQ2pXzVHYts,5249
254
278
  evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SMEhPyu7fyBVz-giO2hlNifE,4499
255
279
  evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
280
+ evalvault/domain/services/ops_snapshot_service.py,sha256=1CqJN2p3tM6SgzLCZKcVEM213fd1cDGexTRPG_3e59w,5138
256
281
  evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
257
282
  evalvault/domain/services/pipeline_template_registry.py,sha256=aWqXLQ24grpSZo9M4tZLRo1ysD10c6hUpW3JupZH9e0,28083
258
283
  evalvault/domain/services/prompt_candidate_service.py,sha256=Ibyb5EaWK28Ju2HnTqHHGOoiA9Q-VwY3hjxVODALwGY,3997
259
284
  evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
260
- evalvault/domain/services/prompt_registry.py,sha256=THcNs4jqp4FTLv9uO-VVvN6XGorkGrcIUwQH1dL74o8,4166
285
+ evalvault/domain/services/prompt_registry.py,sha256=QyL4yIcKT93uv6L0-Q_iaNXno8QnsC19YcGekuSRMtE,5247
261
286
  evalvault/domain/services/prompt_scoring_service.py,sha256=SlvfuIbhj92RJu4RQAJ1BGKhKkOAUOt3cZNH21HtsX4,9833
262
287
  evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9YfZ_yLIxF1MMBo,6731
263
288
  evalvault/domain/services/prompt_suggestion_reporter.py,sha256=Fc6sCPebUMk8SZVpjoJ6bCEun0ma-YmayEQnulBVv8s,10577
264
289
  evalvault/domain/services/ragas_prompt_overrides.py,sha256=4BecYE2KrreUBbIM3ssP9WzHcK_wRc8jW7CE_k58QOU,1412
290
+ evalvault/domain/services/regression_gate_service.py,sha256=qBMODgpizmEzqEL8_JX-FYSVyARiroMW7MFVzlz7gjc,6579
265
291
  evalvault/domain/services/retrieval_metrics.py,sha256=dtrQPLMrXSyWLcgF8EGcLNFwzwA59WDzEh41JRToHAY,2980
266
292
  evalvault/domain/services/retriever_context.py,sha256=ySQ-GuadiggS0LVAib4AxA_0JpasYz4S9hbjau0eyIA,6482
293
+ evalvault/domain/services/run_comparison_service.py,sha256=_NScltCRcY3zrvdyYDiPmssTxCDv1GyjCLdP3uAxJts,5631
267
294
  evalvault/domain/services/satisfaction_calibration_service.py,sha256=H7Z8opOyPHRO5qVIw-XDsNhIwdCteAS9_a3BTlfIqHg,11906
268
- evalvault/domain/services/stage_event_builder.py,sha256=ScTgyeRiH7z_rnNI_2p-i9szVRIRwUxGSJvpEj3zto4,9645
295
+ evalvault/domain/services/stage_event_builder.py,sha256=FAT34Wmylvd2Yz5rDlhaTh1lqSCDhGApCXMi7Hjkib0,9748
269
296
  evalvault/domain/services/stage_metric_guide_service.py,sha256=_JdRsBRWirO24qYFlh6hG-dkoWlX6_XWEYKf_uUlKIQ,8807
270
- evalvault/domain/services/stage_metric_service.py,sha256=KukIWWhWVOtclrET6uyWJ17jG76LfkKiqrUrDIDJ3gw,15327
297
+ evalvault/domain/services/stage_metric_service.py,sha256=_u6ThZ8rGw8H9h3TNpu0j8XhpIfukHSoyc1ZpCa3Z00,18031
271
298
  evalvault/domain/services/stage_summary_service.py,sha256=VVtuAr4vwzvmNFn8rqURJrhKFqAMG4CaBmyGiUk_xG0,1590
272
299
  evalvault/domain/services/synthetic_qa_generator.py,sha256=aiOTPoHZbKRTEeodABQ2I5lq8-Vs_kQtuzcGWd4MTGE,16526
273
300
  evalvault/domain/services/testset_generator.py,sha256=6IpiZ0pqhKEymo-AlUdfJjDkF2P1n8Md_QKV4nOheyg,4470
274
- evalvault/domain/services/threshold_profiles.py,sha256=YuOrD5CkXugAdSQYbMsFzS5VS1R201JOJtpKTs4dpXU,1296
301
+ evalvault/domain/services/threshold_profiles.py,sha256=yYJ7o8SIRufI7kUN8edh8am-dVOq_TEhvDqlHe0WQUQ,1433
275
302
  evalvault/domain/services/unified_report_service.py,sha256=lG3VpMLC1MTYUlcGl-MUEE4PUopkyrhcgj4_ye9c_vM,11829
276
- evalvault/domain/services/visual_space_service.py,sha256=xG2jxKuRuqmQgbWsXOqmytKr6pQ7igQujNgdpb5gyB0,32569
303
+ evalvault/domain/services/visual_space_service.py,sha256=3_qyBsThr5lzP1le6qkXf9ByX3JjoYGX15iMIHe8gQs,34958
277
304
  evalvault/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
278
305
  evalvault/ports/inbound/__init__.py,sha256=2Wsc0vNzH8_ZaErk4OHxP93hRonLUkMbn3W28DtTDO0,562
279
306
  evalvault/ports/inbound/analysis_pipeline_port.py,sha256=RJfKtp22AYEqnmRk6RDawAK52rEmyAhuk0FUPJQUwQU,1758
280
307
  evalvault/ports/inbound/evaluator_port.py,sha256=rDvouIRUjBD7uICgrpeo11vNPvo27_0CdylRHPodPSE,1323
281
308
  evalvault/ports/inbound/learning_hook_port.py,sha256=EeJeMl3chcPHlj5mkLOj6tm8s_qdDRvoCwK1-0l70tI,3297
282
309
  evalvault/ports/inbound/web_port.py,sha256=b4uMhwOMLXy3LeILc7ZK3RR-XtoW4p4NzoTpj4syptg,5578
283
- evalvault/ports/outbound/__init__.py,sha256=0uPE6CXkoK6ECs3O4OZYAmVg5VAAHoF5rdb1eoj1NAc,3367
310
+ evalvault/ports/outbound/__init__.py,sha256=x3LseXtwX0NONM1mnhT3fMchz7U6gEDRUX0TDswpg5E,3591
284
311
  evalvault/ports/outbound/analysis_cache_port.py,sha256=zPSdUVK_yw3PMWPII2YvS1WLmCGlg5bDScSuYINW9yc,1386
285
312
  evalvault/ports/outbound/analysis_module_port.py,sha256=QYzkvie9-BbONj8ZgiQUjm8I-bn8mgzlXTzIXMhehmQ,1881
286
313
  evalvault/ports/outbound/analysis_port.py,sha256=gE-iXToTgdQomj9JwNZJY4nwut8q0J6EurUmJNsnptQ,2127
314
+ evalvault/ports/outbound/artifact_fs_port.py,sha256=SN966vwHiIjLA06MBWePr7V0NmafbiQbSLFlXAN3YKU,273
287
315
  evalvault/ports/outbound/benchmark_port.py,sha256=pgo3rNbvvJS8x03UxBVQPBBgxc7X5kfG70ZlIf3sopE,7173
288
316
  evalvault/ports/outbound/causal_analysis_port.py,sha256=IsyVdFrs66mHcOc-_VbxrZQriwMrDxx-5a_4ElX5Bp0,941
317
+ evalvault/ports/outbound/comparison_pipeline_port.py,sha256=IOLK6vZdzjSV6Qcvkl9GD-wRxx6Waa3dsYOCFdD1mXY,503
289
318
  evalvault/ports/outbound/dataset_port.py,sha256=OpEBlkvFwpSRbmi-Lt3wK7n0wljmQ6m985mjyNn_qFk,990
319
+ evalvault/ports/outbound/difficulty_profile_port.py,sha256=hQY-TR64WyUNnCxD9Mw-QraO3ZBw0VUP8KoCmVsQYBE,347
290
320
  evalvault/ports/outbound/domain_memory_port.py,sha256=SZFurqsoBmTw1Kt_pej-YpMbooVeyV35jekhaDRojus,23320
291
321
  evalvault/ports/outbound/embedding_port.py,sha256=ZHeKRMRBNjpZKWxsLKrD8jJz0M66JTwNcrJbkRaklK4,2034
292
322
  evalvault/ports/outbound/improvement_port.py,sha256=fIXhcG4n6OJ1hdvWeqEoLBrVsCNdHZRgtEZjR8lf3qA,2325
293
323
  evalvault/ports/outbound/intent_classifier_port.py,sha256=gqMIk0rH6Z43ceuMMRX4vqXurgHZz-CJX2bR5PVAkjQ,2253
324
+ evalvault/ports/outbound/judge_calibration_port.py,sha256=kShZ2MZGvgQZaY7XxwkmLXtquK_RFKcwuWRBfJOrILA,602
294
325
  evalvault/ports/outbound/korean_nlp_port.py,sha256=mJCnxBAkV8a5Nd_VX6QcjfDucY62er8GlaNO4HQA8q8,1572
295
326
  evalvault/ports/outbound/llm_factory_port.py,sha256=lzoDJi6A6ltk-t3N4oY8DSwMBMfnvXGgSduILOpzoas,305
296
327
  evalvault/ports/outbound/llm_port.py,sha256=YAW0i-41yT8KzMuzZGEO5yPDkHN0onGxj55eL0cdPHY,4393
297
328
  evalvault/ports/outbound/method_port.py,sha256=sntcKgwagAdJGxp0dI-S_bhBQcOW9QpnND3fOjrsX9E,1377
298
329
  evalvault/ports/outbound/nlp_analysis_port.py,sha256=QDJHAsSpynTenuaKp78t1s--U036mtYeUEX0p5vQw24,3046
330
+ evalvault/ports/outbound/ops_snapshot_port.py,sha256=6v72W41tlnxjkJfbfHhFiJMPlRSAQ-BvrI2T09_yddk,214
299
331
  evalvault/ports/outbound/relation_augmenter_port.py,sha256=cMcHQnmK111WzZr50vYr7affeHhOtpFZxPARwkg9xbk,651
300
332
  evalvault/ports/outbound/report_port.py,sha256=wgReSYL4SupXIoALFh0QFWfX2kzPftXpWTvGLCMd2B8,1315
301
333
  evalvault/ports/outbound/stage_storage_port.py,sha256=Nlf9upsXxgCABQB5cJdpLQYsoZNiGRAU5zE5D-Ptp2I,1201
@@ -306,8 +338,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
306
338
  evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
307
339
  evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
308
340
  evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
309
- evalvault-1.64.0.dist-info/METADATA,sha256=DcFREpjg4tyoNf8FXTK632rgrOsWuFjSGnVBBQ4LeQ4,24276
310
- evalvault-1.64.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
311
- evalvault-1.64.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
312
- evalvault-1.64.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
313
- evalvault-1.64.0.dist-info/RECORD,,
341
+ evalvault-1.66.0.dist-info/METADATA,sha256=f6jzeYkN1iuFwYJTcI8r5L52hVNZwACOlQuWYvVz_JY,26159
342
+ evalvault-1.66.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
343
+ evalvault-1.66.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
344
+ evalvault-1.66.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
345
+ evalvault-1.66.0.dist-info/RECORD,,