evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  5. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  6. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  7. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  8. evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
  9. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  10. evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
  11. evalvault/config/settings.py +21 -0
  12. evalvault/domain/entities/prompt.py +1 -1
  13. evalvault/domain/metrics/__init__.py +8 -0
  14. evalvault/domain/metrics/registry.py +39 -3
  15. evalvault/domain/metrics/summary_accuracy.py +189 -0
  16. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  17. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  18. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  19. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  20. evalvault/domain/services/evaluator.py +280 -27
  21. evalvault/domain/services/prompt_registry.py +39 -10
  22. evalvault/domain/services/threshold_profiles.py +4 -0
  23. evalvault/domain/services/visual_space_service.py +79 -4
  24. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  25. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
  26. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -109,30 +109,59 @@ def build_prompt_summary(bundle: PromptSetBundle) -> dict[str, Any]:
109
109
 
110
110
  def build_prompt_inputs_from_snapshots(
111
111
  snapshots: dict[str, dict[str, Any]] | None,
112
+ *,
113
+ kind: PromptKind = "ragas",
114
+ source: str | None = None,
112
115
  ) -> list[PromptInput]:
113
116
  if not snapshots:
114
117
  return []
115
118
  prompt_inputs: list[PromptInput] = []
116
119
  for metric_name, entry in snapshots.items():
117
- prompt_text = entry.get("prompt") if isinstance(entry, dict) else None
120
+ if not isinstance(entry, dict):
121
+ continue
122
+ entry_source = entry.get("source")
123
+ resolved_source = source if source else entry_source
124
+ metadata = {key: value for key, value in entry.items() if key != "prompt"}
125
+
126
+ prompts_map = entry.get("prompts")
127
+ if isinstance(prompts_map, dict) and prompts_map:
128
+ for prompt_key, prompt_text in prompts_map.items():
129
+ if not isinstance(prompt_text, str):
130
+ continue
131
+ normalized = prompt_text.strip()
132
+ if not normalized:
133
+ continue
134
+ prompt_inputs.append(
135
+ PromptInput(
136
+ content=normalized,
137
+ name=f"{kind}.{metric_name}.{prompt_key}",
138
+ kind=kind,
139
+ role=f"{metric_name}.{prompt_key}",
140
+ source=(
141
+ resolved_source
142
+ if isinstance(resolved_source, str) and resolved_source
143
+ else kind
144
+ ),
145
+ metadata=metadata or None,
146
+ )
147
+ )
148
+ continue
149
+
150
+ prompt_text = entry.get("prompt")
118
151
  if not isinstance(prompt_text, str):
119
152
  continue
120
153
  prompt_text = prompt_text.strip()
121
154
  if not prompt_text:
122
155
  continue
123
- source = entry.get("source") if isinstance(entry, dict) else None
124
- metadata = {
125
- key: value
126
- for key, value in entry.items()
127
- if key != "prompt" and isinstance(entry, dict)
128
- }
129
156
  prompt_inputs.append(
130
157
  PromptInput(
131
158
  content=prompt_text,
132
- name=f"ragas.{metric_name}",
133
- kind="ragas",
159
+ name=f"{kind}.{metric_name}",
160
+ kind=kind,
134
161
  role=str(metric_name),
135
- source=source if isinstance(source, str) and source else "ragas",
162
+ source=resolved_source
163
+ if isinstance(resolved_source, str) and resolved_source
164
+ else kind,
136
165
  metadata=metadata or None,
137
166
  )
138
167
  )
@@ -8,6 +8,10 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
8
8
  "summary_faithfulness": 0.90,
9
9
  "summary_score": 0.85,
10
10
  "entity_preservation": 0.90,
11
+ "summary_accuracy": 0.90,
12
+ "summary_risk_coverage": 0.90,
13
+ "summary_non_definitive": 0.80,
14
+ "summary_needs_followup": 0.80,
11
15
  }
12
16
  QA_RECOMMENDED_THRESHOLDS = {
13
17
  "faithfulness": 0.70,
@@ -233,13 +233,13 @@ def _aggregate_stage_metrics(metrics: Iterable[StageMetric]) -> dict[str, dict[s
233
233
 
234
234
  aggregated: dict[str, dict[str, float]] = {}
235
235
  for name, entries in buckets.items():
236
- scores = [m.score for m in entries]
236
+ scores = [m.score for m in entries if m.score is not None]
237
237
  threshold = next(
238
238
  (m.threshold for m in entries if m.threshold is not None),
239
239
  DEFAULT_STAGE_THRESHOLDS.get(name),
240
240
  )
241
241
  aggregated[name] = {
242
- "avg": mean(scores) if scores else None,
242
+ "avg": mean(scores) if scores else 0.0,
243
243
  "threshold": threshold if threshold is not None else DEFAULT_METRIC_THRESHOLD,
244
244
  }
245
245
  return aggregated
@@ -770,6 +770,77 @@ def _build_case_coords(result: TestCaseResult) -> dict[str, float | None]:
770
770
  ),
771
771
  ]
772
772
  )
773
+
774
+ if x_value is None:
775
+ x_value = _weighted_average(
776
+ [
777
+ (
778
+ _centered_norm(
779
+ scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
780
+ ),
781
+ 0.4,
782
+ ),
783
+ (
784
+ _centered_norm(
785
+ scores.get("summary_risk_coverage"),
786
+ thresholds.get("summary_risk_coverage"),
787
+ ),
788
+ 0.3,
789
+ ),
790
+ (
791
+ _centered_norm(
792
+ scores.get("summary_faithfulness"),
793
+ thresholds.get("summary_faithfulness"),
794
+ ),
795
+ 0.2,
796
+ ),
797
+ (
798
+ _centered_norm(scores.get("summary_score"), thresholds.get("summary_score")),
799
+ 0.1,
800
+ ),
801
+ (
802
+ _centered_norm(
803
+ scores.get("entity_preservation"),
804
+ thresholds.get("entity_preservation"),
805
+ ),
806
+ 0.2,
807
+ ),
808
+ ]
809
+ )
810
+
811
+ if y_value is None:
812
+ y_value = _weighted_average(
813
+ [
814
+ (
815
+ _centered_norm(
816
+ scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
817
+ ),
818
+ 0.35,
819
+ ),
820
+ (
821
+ _centered_norm(
822
+ scores.get("summary_non_definitive"),
823
+ thresholds.get("summary_non_definitive"),
824
+ ),
825
+ 0.35,
826
+ ),
827
+ (
828
+ _centered_norm(
829
+ scores.get("summary_needs_followup"),
830
+ thresholds.get("summary_needs_followup"),
831
+ ),
832
+ 0.3,
833
+ ),
834
+ (
835
+ _centered_norm(
836
+ scores.get("entity_preservation"),
837
+ thresholds.get("entity_preservation"),
838
+ ),
839
+ 0.2,
840
+ ),
841
+ ]
842
+ )
843
+
773
844
  return {"x": x_value, "y": y_value}
774
845
 
775
846
 
@@ -799,8 +870,12 @@ def _build_cluster_points(
799
870
 
800
871
  points = []
801
872
  for cluster_id, coords_list in clusters.items():
802
- x_values = [c.get("x") for c in coords_list if c.get("x") is not None]
803
- y_values = [c.get("y") for c in coords_list if c.get("y") is not None]
873
+ x_values = [
874
+ value for value in (c.get("x") for c in coords_list) if isinstance(value, (int, float))
875
+ ]
876
+ y_values = [
877
+ value for value in (c.get("y") for c in coords_list) if isinstance(value, (int, float))
878
+ ]
804
879
  x_avg = mean(x_values) if x_values else None
805
880
  y_avg = mean(y_values) if y_values else None
806
881
  quadrant = _quadrant_label(x_avg, y_avg)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evalvault
3
- Version: 1.65.0
3
+ Version: 1.66.0
4
4
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
5
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
6
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -25,6 +25,7 @@ Classifier: Topic :: Software Development :: Quality Assurance
25
25
  Classifier: Topic :: Software Development :: Testing
26
26
  Classifier: Typing :: Typed
27
27
  Requires-Python: >=3.12
28
+ Requires-Dist: chainlit>=2.9.5
28
29
  Requires-Dist: chardet
29
30
  Requires-Dist: fastapi>=0.128.0
30
31
  Requires-Dist: instructor
@@ -137,12 +138,17 @@ English version? See `README.en.md`.
137
138
  ## Quick Links
138
139
 
139
140
  - 문서 허브: `docs/INDEX.md`
141
+ - CLI 실행 시나리오 가이드: `docs/guides/RAG_CLI_WORKFLOW_TEMPLATES.md`
140
142
  - 사용자 가이드: `docs/guides/USER_GUIDE.md`
141
143
  - 개발 가이드: `docs/guides/DEV_GUIDE.md`
142
144
  - 상태/로드맵: `docs/STATUS.md`, `docs/ROADMAP.md`
143
145
  - 개발 백서(설계/운영/품질 기준): `docs/new_whitepaper/INDEX.md`
144
146
  - Open RAG Trace: `docs/architecture/open-rag-trace-spec.md`
145
147
 
148
+ ### 다음 개선 작업 메모
149
+ - 보험 요약 메트릭 확장 계획: `docs/guides/INSURANCE_SUMMARY_METRICS_PLAN.md`
150
+ - Prompt 반복 적용 계획: `docs/guides/repeat_query.md`
151
+
146
152
  ---
147
153
 
148
154
  ## EvalVault가 해결하는 문제
@@ -470,6 +476,24 @@ npm run dev
470
476
  - Ragas 계열: `faithfulness`, `answer_relevancy`, `context_precision`, `context_recall`, `factual_correctness`, `semantic_similarity`
471
477
  - 커스텀 예시(도메인): `insurance_term_accuracy`
472
478
 
479
+ ### 요약 메트릭 설계 근거 (summary_score, summary_faithfulness, entity_preservation)
480
+
481
+ ### 커스텀 메트릭 스냅샷 (평가 방식/과정/결과 기록)
482
+ - 평가 방식/입출력/규칙/구현 파일 해시를 `run.tracker_metadata.custom_metric_snapshot`에 기록합니다.
483
+ - Excel `CustomMetrics` 시트와 Langfuse/Phoenix/MLflow artifact에도 함께 저장됩니다.
484
+
485
+ - `summary_faithfulness`: 요약의 모든 주장이 컨텍스트에 근거하는지 평가합니다. 환각/왜곡 리스크를 직접적으로 측정합니다.
486
+ - `summary_score`: 컨텍스트 대비 요약의 핵심 정보 보존/간결성 균형을 평가합니다. 정답 요약 단일 기준의 편향을 줄입니다.
487
+ - `entity_preservation`: 금액·기간·조건·면책 등 보험 약관에서 중요한 엔티티가 요약에 유지되는지 측정합니다.
488
+
489
+ **보험 도메인 특화 근거**
490
+ - 보험 약관에서 치명적인 요소(면책, 자기부담, 한도, 조건 등)를 키워드로 직접 반영하고, 금액/기간/비율 같은 핵심 엔티티를 보존하도록 설계했습니다.
491
+ - 범용 규칙(숫자/기간/금액)과 보험 특화 키워드를 함께 사용하므로, 현재 상태는 “보험 리스크 중심의 약한 도메인 특화”로 보는 것이 정확합니다.
492
+
493
+ **해석 주의사항**
494
+ - 세 메트릭 모두 `contexts` 품질에 크게 의존합니다. 컨텍스트가 부정확/과도하면 점수가 낮아질 수 있습니다.
495
+ - `summary_score`는 키프레이즈 기반이므로, 표현이 달라지면 점수가 낮게 나올 수 있습니다.
496
+
473
497
  정확한 옵션/운영 레시피는 `docs/guides/USER_GUIDE.md`를 기준으로 최신화합니다.
474
498
 
475
499
  ---
@@ -5,10 +5,11 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
5
5
  evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
7
7
  evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
8
- evalvault/adapters/inbound/api/adapter.py,sha256=tYkJciUUFOK80QcSwzrqkXP1G4qUFItFV7uBYbjBGqU,68473
9
- evalvault/adapters/inbound/api/main.py,sha256=lRuyg3aBs5jIk7tq4p4d7jrRkFpV_brZypoOq8s56Rk,6896
8
+ evalvault/adapters/inbound/api/adapter.py,sha256=HgWSYyUxvJPlaSG158WVzpPckpPCYV9Ec3CWN8rLFdI,69118
9
+ evalvault/adapters/inbound/api/main.py,sha256=skYtmDngdOBryyLXQpNGlSd2Te6RF6GtfIwcMACPHFU,7068
10
10
  evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
11
11
  evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
12
+ evalvault/adapters/inbound/api/routers/chat.py,sha256=3S6-ljiY1COlDuVDH5yzMJs9SO0EkuosRcJIYScHWvI,18143
12
13
  evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
13
14
  evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
14
15
  evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
@@ -42,8 +43,8 @@ evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=NeqWLzO9kRDuZd0pHAIHg
42
43
  evalvault/adapters/inbound/cli/commands/profile_difficulty.py,sha256=nOJH3iqgLAlXq4keLBj5oqpiRCg0jjGgT-7Q57HxEh8,6665
43
44
  evalvault/adapters/inbound/cli/commands/prompts.py,sha256=lddde5VbjYaqN_9gHPLNu6DWpg5fE-KqZzjN-XYwvJw,27153
44
45
  evalvault/adapters/inbound/cli/commands/regress.py,sha256=Dy8hUOdjapxOW9Hoov0DHHblkMaExiqWfYS14CaC9Kk,8806
45
- evalvault/adapters/inbound/cli/commands/run.py,sha256=X19rgXhajhvZNA4c0JMmzmPatTxhZgfapuW07bZL9xA,119265
46
- evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=hu2TioocitUZzGR7HUwZ6gOeEJSvt5tGNjwXOlo4Eic,40336
46
+ evalvault/adapters/inbound/cli/commands/run.py,sha256=aKoZcQbOJ1KB_4zPk4L-AWw3u9vGWg3SaooR7A3Xd_Y,119910
47
+ evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=93jFUg8QLrD38QU2JhOhFMoHDWUphSEKRdJ5KcUvrkQ,40806
47
48
  evalvault/adapters/inbound/cli/commands/stage.py,sha256=oRC9c5CysLX90Iy5Ba1pc_00DaOBS78lcBvzkbdrGRM,17123
48
49
  evalvault/adapters/inbound/cli/utils/__init__.py,sha256=QPNKneZS-Z-tTnYYxtgJXgcJWY6puUlRQcKrn7Mlv1M,685
49
50
  evalvault/adapters/inbound/cli/utils/analysis_io.py,sha256=RHkKEq4e-PtbtRDlXAJWU80RYHNPw-O5V9_GujdaGfc,13393
@@ -177,10 +178,10 @@ evalvault/adapters/outbound/nlp/korean/toolkit_factory.py,sha256=x3v-AAkVInOabC4
177
178
  evalvault/adapters/outbound/phoenix/sync_service.py,sha256=i6gHpNiZXKQ5yzV9B2TPb-P1N45k_Ck5ruzh3oqp4d8,9122
178
179
  evalvault/adapters/outbound/report/__init__.py,sha256=8OUduTHnWkBLHYrc7mBg45DnAwz0RgvSJmz1HqxVjLY,477
179
180
  evalvault/adapters/outbound/report/dashboard_generator.py,sha256=Dcu18NTK4lS8XNKnnnquagpZkd-4TSf5Mb2isFNW5Pk,7800
180
- evalvault/adapters/outbound/report/llm_report_generator.py,sha256=HUDA_IPBbl54cyEjTTJzdKTQ6H4IoZi-1VBdVmZf0uI,26593
181
+ evalvault/adapters/outbound/report/llm_report_generator.py,sha256=i_iXfY8qutIb8TsvLKyMLnijsA0yiNJ3rBEFg4zVqcE,26858
181
182
  evalvault/adapters/outbound/report/markdown_adapter.py,sha256=5PS72h_qe4ZtYs-umhX5TqQL2k5SuDaCUc6rRw9AKRw,16761
182
183
  evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
183
- evalvault/adapters/outbound/storage/base_sql.py,sha256=7jWtmNDBHncLDABf5ewwQJnfhFjySTfpfDJmEbPBD1w,40823
184
+ evalvault/adapters/outbound/storage/base_sql.py,sha256=bNjJr941wqeLgv4E772JlOer1Q8OpJWxyotsNNn_R98,42536
184
185
  evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
185
186
  evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=HLaoQ3YJDFwOxeY0S92oPIqb-7EgWSasgt89RM86vr0,47148
186
187
  evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=A9MfO0pjf4kjxoRj2KPI0Gg1cbX13I2YE3oieT-PGiI,8906
@@ -193,10 +194,10 @@ evalvault/adapters/outbound/tracer/open_rag_trace_decorators.py,sha256=LFnk-3FSL
193
194
  evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py,sha256=D48Mbj-ioDKztjhV9513Q5DiUNiVdO60B_2sWMFEmnI,3520
194
195
  evalvault/adapters/outbound/tracer/phoenix_tracer_adapter.py,sha256=inmTAolAVsm0IrszE9VTJoI7HSvGGAnGNZVu_vZRAGg,741
195
196
  evalvault/adapters/outbound/tracker/__init__.py,sha256=Suu5BznOK5uTuD5_jS8JMZd8RPfQNlddLxHCBvMTm_4,358
196
- evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=oi6bnz2qjDSXEea_bACFRUEKk-tKw53d1xMfL4FE5dI,18749
197
+ evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=uI-t5v9AC5VUMYsIc1FHYImourZeErGMXB0_prOMErc,18839
197
198
  evalvault/adapters/outbound/tracker/log_sanitizer.py,sha256=ilKTTSzsHslQYc-elnWu0Z3HKNNw1D1iI0_cCvYbo1M,2653
198
- evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=m4xj3XBULFYg27U3twKrldLhbLyLNefezmb2pCpHJrw,7180
199
- evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=VmGkVuBX5Nae6zTaaSbpcA94spFJGmwjW0gOh2vIxnk,25332
199
+ evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=6pSxbxSDZE7jN7uSMU6VFg0JlO7cBiMLYcd53NYpfcY,7350
200
+ evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=8p2qJeKn6OvIxNbD16h_QrhmCzKIBIf8_ej535MNn_A,26443
200
201
  evalvault/config/__init__.py,sha256=UCgeDx62M2gOuFvdN29wWwny2fdH4bPY_uUC3-42eDw,1297
201
202
  evalvault/config/agent_types.py,sha256=EP2Pv3ZtOzDXIvIa-Hnd1to9JIbMUtGitrlwzZtx0Ys,13418
202
203
  evalvault/config/domain_config.py,sha256=rOgNA2T8NWlDzcEFC0shdUCCww0lI1E5fUm5QrKQSZI,9264
@@ -205,7 +206,7 @@ evalvault/config/langfuse_support.py,sha256=DEzVMfMGGf1V45W_2oUG-NCDfsYI4UUdnYJI
205
206
  evalvault/config/model_config.py,sha256=KlzDbGyDLeOGE7ElekFFk5YjjT5u8i6KO2B4EyZkLnI,3542
206
207
  evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHWyVyOH0,13604
207
208
  evalvault/config/secret_manager.py,sha256=YjPMuNqeBrAR2BzCJvsBNUExaU4TBSFyZ8kVYZZifqA,4172
208
- evalvault/config/settings.py,sha256=JKJf8t20sOHYnHoCfTxqupQixNgfmWYJhChiGMNz-W0,17617
209
+ evalvault/config/settings.py,sha256=xvoNma4CHAd8R_nF0DL4MUWXBWCR5M0C68NPSPLT5JQ,18285
209
210
  evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
210
211
  evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
211
212
  evalvault/domain/entities/__init__.py,sha256=wszRJ1Imdc5NJ1bQPC2udk-mAgFdlw4uZV5IPNjLpHQ,3669
@@ -222,20 +223,24 @@ evalvault/domain/entities/judge_calibration.py,sha256=fhQEI7g2nZuG1OliikhxgefcFA
222
223
  evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0U8,3001
223
224
  evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
224
225
  evalvault/domain/entities/method.py,sha256=a3jZi7SjcpK3HeVyVwQkUMwpnmg2RbxCnH4NqYPLCOI,1157
225
- evalvault/domain/entities/prompt.py,sha256=VzuUzqkqXv0FwTSNGyV5sSCft5sxTbG_Noq6Ymnke5o,2910
226
+ evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KTNjVwfY,2920
226
227
  evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
227
228
  evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
228
229
  evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
229
230
  evalvault/domain/entities/stage.py,sha256=UqS59sjoMs_bhMupNtvagbIx8QgHgFjWoRPhJ3uJP2s,7426
230
- evalvault/domain/metrics/__init__.py,sha256=fxjC5Z_8OuBIeMn80bYgnZZxpNoay2wH-qtG3NqCUvk,797
231
+ evalvault/domain/metrics/__init__.py,sha256=Ros3CWg5in1xlEdMa0WUSG602SBVkxw2Zbro-XUlmxU,1214
231
232
  evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
232
233
  evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
233
234
  evalvault/domain/metrics/contextual_relevancy.py,sha256=xAPYUv_0TM4j4WOutOSGftNln_l-2Ev6qpANeu4REv8,11057
234
235
  evalvault/domain/metrics/entity_preservation.py,sha256=uSCbaETceE5PbGn-230Rm8pryOA8jDkkeOwAkWxA65g,6500
235
236
  evalvault/domain/metrics/insurance.py,sha256=5NPeAi_86rpuZRgV4KhzomGrq3Uw2jjglN6FfA_AO8o,4040
236
237
  evalvault/domain/metrics/no_answer.py,sha256=x6vRyOa1jw-qsnw9kOYT8YMPdLElaDRu7zjNCpyJfqM,8237
237
- evalvault/domain/metrics/registry.py,sha256=QKjo4RNHxCqObGg36xJP3KAHqFpHM50Jy7GeSksdz0Y,5665
238
+ evalvault/domain/metrics/registry.py,sha256=1CKPxSjdXK739zhzwodukGDL-dIhlJJH30cuP-czQWc,6926
238
239
  evalvault/domain/metrics/retrieval_rank.py,sha256=F55ByadJBowyKHKBmKAZ0T0qN_R1_7UNu-MiLnT4Ypg,14675
240
+ evalvault/domain/metrics/summary_accuracy.py,sha256=Hr4QS1e4Rxt1MgcTj5rElKuPw9rWS-zGkI0d8wB5dwA,5988
241
+ evalvault/domain/metrics/summary_needs_followup.py,sha256=5kExtZxxankP7csAAIZe_1uRFeBD7NQK-N15b5d0awM,1357
242
+ evalvault/domain/metrics/summary_non_definitive.py,sha256=1EE-z0Ib66gpjc0MGZHmZJHJfpoACSIldgOwFkUNxg0,1029
243
+ evalvault/domain/metrics/summary_risk_coverage.py,sha256=Fo-dMg_jU4MCr0YqOZzBZymwEbG9y2H6eLX-jmuS8IU,1777
239
244
  evalvault/domain/metrics/terms_dictionary.json,sha256=-ZQmpx6yMOYoAOpcLj-xK2LkAeCbAw0EUb6-syIOKS0,3801
240
245
  evalvault/domain/metrics/text_match.py,sha256=P-YTZs9ekDqEmxLNBP8eXnMRymPdC8V4dJPtwG2ajVM,10219
241
246
  evalvault/domain/services/__init__.py,sha256=X5Af1kf_vSt3S3mFwOV6OQdro-lFxwbVdNd7nJznkC8,1024
@@ -248,6 +253,7 @@ evalvault/domain/services/benchmark_runner.py,sha256=4tvQEDrfvp2fC2luUPuPBcRjEPL
248
253
  evalvault/domain/services/benchmark_service.py,sha256=TrmnvBMAPmcs0PewGZcn2rxHbviZ8KxmDvJCeyqm28I,6286
249
254
  evalvault/domain/services/cache_metrics.py,sha256=FKNZoxym30lc1SxTGmTn3Pr-PDNoAqgC9_d_IdF_jOQ,3463
250
255
  evalvault/domain/services/cluster_map_builder.py,sha256=qPKMPj-eSqECJSCOKvv3ZETgIwxwiKWbU3d6_feCoDg,6885
256
+ evalvault/domain/services/custom_metric_snapshot.py,sha256=_MLOzBlHTRyTQ2NuunZ_lrLVF0__kvEcCUxXVVCeoRA,9684
251
257
  evalvault/domain/services/dataset_preprocessor.py,sha256=PnhLiPk0E9DIzjUr8N75296CCfl1AUXGv-lpaXBi0Ok,14797
252
258
  evalvault/domain/services/debug_report_service.py,sha256=SGdFh8tctAIq7RotFbg47eetxdYSS4Yju7-LOzpCMCM,4386
253
259
  evalvault/domain/services/difficulty_profile_reporter.py,sha256=uIj9-eiO2dDvQ6tP-DJBddfBq8VT63st0wtNC8Co4NQ,680
@@ -257,7 +263,7 @@ evalvault/domain/services/document_versioning.py,sha256=M1qZaMpQ2exVT1wkVAmvEPPu
257
263
  evalvault/domain/services/domain_learning_hook.py,sha256=rhKBmdnrJyfGzFNsNxzyv8jZO26-WOosHSmBV_9qdJg,7176
258
264
  evalvault/domain/services/embedding_overlay.py,sha256=ZTNxUPXpHGbQ3Uri5DD3feTUFn7qrhuNshhyCQEvRuM,3559
259
265
  evalvault/domain/services/entity_extractor.py,sha256=f3Rf5saK8QsgetLNK1Hbxzt8PtttJZCicSR63S8DJ5k,14141
260
- evalvault/domain/services/evaluator.py,sha256=cKcPRoqkUFc00lgx_blbIFl0qJyPIDjPeCciWAPm7m8,67669
266
+ evalvault/domain/services/evaluator.py,sha256=Fvth2VdckDJvGuwxbXPnvPfQU59WZSJHV63H4qji4lM,78815
261
267
  evalvault/domain/services/experiment_comparator.py,sha256=IBrxIwux-8GucwlLx6e5lUqB9miSPvBLGJK9ctoW7Y0,3299
262
268
  evalvault/domain/services/experiment_manager.py,sha256=2k-qGiAUyZuqqmcp4P-M3Z9HTXwwcqW5HQYKNkcIHuI,4863
263
269
  evalvault/domain/services/experiment_reporter.py,sha256=QYlVmCFSx8hKTPMezc7QjJE07b3MSQ82Q4QVucSHLVY,1420
@@ -276,7 +282,7 @@ evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xy
276
282
  evalvault/domain/services/pipeline_template_registry.py,sha256=aWqXLQ24grpSZo9M4tZLRo1ysD10c6hUpW3JupZH9e0,28083
277
283
  evalvault/domain/services/prompt_candidate_service.py,sha256=Ibyb5EaWK28Ju2HnTqHHGOoiA9Q-VwY3hjxVODALwGY,3997
278
284
  evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
279
- evalvault/domain/services/prompt_registry.py,sha256=THcNs4jqp4FTLv9uO-VVvN6XGorkGrcIUwQH1dL74o8,4166
285
+ evalvault/domain/services/prompt_registry.py,sha256=QyL4yIcKT93uv6L0-Q_iaNXno8QnsC19YcGekuSRMtE,5247
280
286
  evalvault/domain/services/prompt_scoring_service.py,sha256=SlvfuIbhj92RJu4RQAJ1BGKhKkOAUOt3cZNH21HtsX4,9833
281
287
  evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9YfZ_yLIxF1MMBo,6731
282
288
  evalvault/domain/services/prompt_suggestion_reporter.py,sha256=Fc6sCPebUMk8SZVpjoJ6bCEun0ma-YmayEQnulBVv8s,10577
@@ -292,9 +298,9 @@ evalvault/domain/services/stage_metric_service.py,sha256=_u6ThZ8rGw8H9h3TNpu0j8X
292
298
  evalvault/domain/services/stage_summary_service.py,sha256=VVtuAr4vwzvmNFn8rqURJrhKFqAMG4CaBmyGiUk_xG0,1590
293
299
  evalvault/domain/services/synthetic_qa_generator.py,sha256=aiOTPoHZbKRTEeodABQ2I5lq8-Vs_kQtuzcGWd4MTGE,16526
294
300
  evalvault/domain/services/testset_generator.py,sha256=6IpiZ0pqhKEymo-AlUdfJjDkF2P1n8Md_QKV4nOheyg,4470
295
- evalvault/domain/services/threshold_profiles.py,sha256=YuOrD5CkXugAdSQYbMsFzS5VS1R201JOJtpKTs4dpXU,1296
301
+ evalvault/domain/services/threshold_profiles.py,sha256=yYJ7o8SIRufI7kUN8edh8am-dVOq_TEhvDqlHe0WQUQ,1433
296
302
  evalvault/domain/services/unified_report_service.py,sha256=lG3VpMLC1MTYUlcGl-MUEE4PUopkyrhcgj4_ye9c_vM,11829
297
- evalvault/domain/services/visual_space_service.py,sha256=xG2jxKuRuqmQgbWsXOqmytKr6pQ7igQujNgdpb5gyB0,32569
303
+ evalvault/domain/services/visual_space_service.py,sha256=3_qyBsThr5lzP1le6qkXf9ByX3JjoYGX15iMIHe8gQs,34958
298
304
  evalvault/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
299
305
  evalvault/ports/inbound/__init__.py,sha256=2Wsc0vNzH8_ZaErk4OHxP93hRonLUkMbn3W28DtTDO0,562
300
306
  evalvault/ports/inbound/analysis_pipeline_port.py,sha256=RJfKtp22AYEqnmRk6RDawAK52rEmyAhuk0FUPJQUwQU,1758
@@ -332,8 +338,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
332
338
  evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
333
339
  evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
334
340
  evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
335
- evalvault-1.65.0.dist-info/METADATA,sha256=1f23dU3LtSQwrlalym1j8mH-vZaM7HWMzDh-ByjVOjo,24276
336
- evalvault-1.65.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
337
- evalvault-1.65.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
338
- evalvault-1.65.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
339
- evalvault-1.65.0.dist-info/RECORD,,
341
+ evalvault-1.66.0.dist-info/METADATA,sha256=f6jzeYkN1iuFwYJTcI8r5L52hVNZwACOlQuWYvVz_JY,26159
342
+ evalvault-1.66.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
343
+ evalvault-1.66.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
344
+ evalvault-1.66.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
345
+ evalvault-1.66.0.dist-info/RECORD,,