evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/evaluator.py +280 -27
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -109,30 +109,59 @@ def build_prompt_summary(bundle: PromptSetBundle) -> dict[str, Any]:
 
 def build_prompt_inputs_from_snapshots(
     snapshots: dict[str, dict[str, Any]] | None,
+    *,
+    kind: PromptKind = "ragas",
+    source: str | None = None,
 ) -> list[PromptInput]:
     if not snapshots:
         return []
     prompt_inputs: list[PromptInput] = []
     for metric_name, entry in snapshots.items():
-
+        if not isinstance(entry, dict):
+            continue
+        entry_source = entry.get("source")
+        resolved_source = source if source else entry_source
+        metadata = {key: value for key, value in entry.items() if key != "prompt"}
+
+        prompts_map = entry.get("prompts")
+        if isinstance(prompts_map, dict) and prompts_map:
+            for prompt_key, prompt_text in prompts_map.items():
+                if not isinstance(prompt_text, str):
+                    continue
+                normalized = prompt_text.strip()
+                if not normalized:
+                    continue
+                prompt_inputs.append(
+                    PromptInput(
+                        content=normalized,
+                        name=f"{kind}.{metric_name}.{prompt_key}",
+                        kind=kind,
+                        role=f"{metric_name}.{prompt_key}",
+                        source=(
+                            resolved_source
+                            if isinstance(resolved_source, str) and resolved_source
+                            else kind
+                        ),
+                        metadata=metadata or None,
+                    )
+                )
+            continue
+
+        prompt_text = entry.get("prompt")
         if not isinstance(prompt_text, str):
            continue
         prompt_text = prompt_text.strip()
         if not prompt_text:
            continue
-        source = entry.get("source") if isinstance(entry, dict) else None
-        metadata = {
-            key: value
-            for key, value in entry.items()
-            if key != "prompt" and isinstance(entry, dict)
-        }
         prompt_inputs.append(
             PromptInput(
                 content=prompt_text,
-                name=f"
-                kind=
+                name=f"{kind}.{metric_name}",
+                kind=kind,
                 role=str(metric_name),
-                source=
+                source=resolved_source
+                if isinstance(resolved_source, str) and resolved_source
+                else kind,
                 metadata=metadata or None,
             )
         )
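Judging by the +39/−10 counts in the file list, this hunk appears to belong to `evalvault/domain/services/prompt_registry.py`. Below is a minimal usage sketch of the updated function, assuming it and `PromptInput` are importable from that module; the snapshot payload and the printed expectations are invented for illustration, not taken from the package.

```python
# Assumed import path, inferred from the changed-file list above.
from evalvault.domain.services.prompt_registry import build_prompt_inputs_from_snapshots

# Hypothetical snapshot payload; real snapshots are produced by the evaluator.
snapshots = {
    "summary_accuracy": {
        "source": "custom",
        "prompts": {  # new in 1.66.0: a per-metric map of named prompts
            "system": "You grade insurance summaries...",
            "scoring": "Return a score between 0 and 1...",
        },
    },
    "faithfulness": {"prompt": "Judge whether every claim is grounded."},
}

# kind/source are the new keyword-only arguments added in this release.
inputs = build_prompt_inputs_from_snapshots(snapshots, kind="ragas", source=None)

for item in inputs:
    # Per the diff: "ragas.summary_accuracy.system" and ".scoring" keep the
    # entry's own source ("custom"), while "ragas.faithfulness" falls back
    # to the kind ("ragas") because its entry declares no source.
    print(item.name, item.role, item.source)
```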
@@ -8,6 +8,10 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
 QA_RECOMMENDED_THRESHOLDS = {
     "faithfulness": 0.70,
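The four added entries extend `SUMMARY_RECOMMENDED_THRESHOLDS` in what appears to be `evalvault/domain/services/threshold_profiles.py` (the only +4/−0 file in the list). A small sketch of how a caller might resolve these recommendations; the mapping is copied from the hunk above, while the fallback value is an assumption of this sketch and not a value from the package.

```python
# Mapping copied from the diff above (recommended thresholds for summary runs).
SUMMARY_RECOMMENDED_THRESHOLDS: dict[str, float] = {
    "summary_faithfulness": 0.90,
    "summary_score": 0.85,
    "entity_preservation": 0.90,
    "summary_accuracy": 0.90,
    "summary_risk_coverage": 0.90,
    "summary_non_definitive": 0.80,
    "summary_needs_followup": 0.80,
}

FALLBACK_THRESHOLD = 0.75  # assumed fallback; the package's default is not shown in this diff


def recommended_threshold(metric: str) -> float:
    """Look up a recommended threshold, falling back to a default for unlisted metrics."""
    return SUMMARY_RECOMMENDED_THRESHOLDS.get(metric, FALLBACK_THRESHOLD)


print(recommended_threshold("summary_risk_coverage"))  # 0.9
print(recommended_threshold("some_unlisted_metric"))   # 0.75 (fallback)
```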
@@ -233,13 +233,13 @@ def _aggregate_stage_metrics(metrics: Iterable[StageMetric]) -> dict[str, dict[s
 
     aggregated: dict[str, dict[str, float]] = {}
     for name, entries in buckets.items():
-        scores = [m.score for m in entries]
+        scores = [m.score for m in entries if m.score is not None]
         threshold = next(
             (m.threshold for m in entries if m.threshold is not None),
             DEFAULT_STAGE_THRESHOLDS.get(name),
         )
         aggregated[name] = {
-            "avg": mean(scores) if scores else
+            "avg": mean(scores) if scores else 0.0,
             "threshold": threshold if threshold is not None else DEFAULT_METRIC_THRESHOLD,
         }
     return aggregated
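The two changed lines make the stage-metric aggregation tolerant of unscored entries: `None` scores are dropped before `mean()` is called, and the average falls back to `0.0` when nothing remains. A runnable sketch of the same pattern, using a simplified stand-in for the package's `StageMetric` entity:

```python
from dataclasses import dataclass
from statistics import mean


@dataclass
class StageMetric:  # simplified stand-in for evalvault's StageMetric entity
    name: str
    score: float | None
    threshold: float | None = None


entries = [
    StageMetric("retrieval_precision", 1.0),
    StageMetric("retrieval_precision", None),  # stage ran but produced no score
    StageMetric("retrieval_precision", 0.5),
]

scores = [m.score for m in entries if m.score is not None]  # drop None scores
avg = mean(scores) if scores else 0.0  # the 0.0 fallback mirrors the new code

print(avg)  # 0.75 -- mean([1.0, None, 0.5]) would raise a TypeError instead
```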
@@ -770,6 +770,77 @@ def _build_case_coords(result: TestCaseResult) -> dict[str, float | None]:
             ),
         ]
     )
+
+    if x_value is None:
+        x_value = _weighted_average(
+            [
+                (
+                    _centered_norm(
+                        scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
+                    ),
+                    0.4,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("summary_risk_coverage"),
+                        thresholds.get("summary_risk_coverage"),
+                    ),
+                    0.3,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("summary_faithfulness"),
+                        thresholds.get("summary_faithfulness"),
+                    ),
+                    0.2,
+                ),
+                (
+                    _centered_norm(scores.get("summary_score"), thresholds.get("summary_score")),
+                    0.1,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("entity_preservation"),
+                        thresholds.get("entity_preservation"),
+                    ),
+                    0.2,
+                ),
+            ]
+        )
+
+    if y_value is None:
+        y_value = _weighted_average(
+            [
+                (
+                    _centered_norm(
+                        scores.get("summary_accuracy"), thresholds.get("summary_accuracy")
+                    ),
+                    0.35,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("summary_non_definitive"),
+                        thresholds.get("summary_non_definitive"),
+                    ),
+                    0.35,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("summary_needs_followup"),
+                        thresholds.get("summary_needs_followup"),
+                    ),
+                    0.3,
+                ),
+                (
+                    _centered_norm(
+                        scores.get("entity_preservation"),
+                        thresholds.get("entity_preservation"),
+                    ),
+                    0.2,
+                ),
+            ]
+        )
+
     return {"x": x_value, "y": y_value}
 
 
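This addition (apparently in `evalvault/domain/services/visual_space_service.py`, the file whose +79/−4 counts match hunks in this range) gives summary-style runs a fallback for the visual-map coordinates: a weighted blend of the new summary metrics. `_weighted_average` and `_centered_norm` are existing helpers whose bodies are not part of this diff; the sketch below is a hypothetical reconstruction of their likely behavior (normalize a score to roughly [−1, 1] around its threshold, then take a weighted mean of the non-missing values), not the package's actual code.

```python
def _centered_norm(score: float | None, threshold: float | None) -> float | None:
    """Assumed behavior: map a score onto about [-1, 1], centered on its threshold."""
    if score is None:
        return None
    pivot = threshold if threshold is not None else 0.5
    span = (1.0 - pivot) if score >= pivot else pivot
    return (score - pivot) / span if span else 0.0


def _weighted_average(pairs: list[tuple[float | None, float]]) -> float | None:
    """Assumed behavior: weighted mean that skips metrics with no value."""
    valid = [(value, weight) for value, weight in pairs if value is not None]
    if not valid:
        return None
    total = sum(weight for _, weight in valid)
    return sum(value * weight for value, weight in valid) / total if total else None


# With these assumptions, a case that only has summary_accuracy scored still
# gets a usable coordinate: the missing metrics are simply skipped.
print(_weighted_average([(_centered_norm(0.95, 0.90), 0.4), (None, 0.3)]))  # ~0.5
```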
@@ -799,8 +870,12 @@ def _build_cluster_points(
 
     points = []
     for cluster_id, coords_list in clusters.items():
-        x_values = [
-
+        x_values = [
+            value for value in (c.get("x") for c in coords_list) if isinstance(value, (int, float))
+        ]
+        y_values = [
+            value for value in (c.get("y") for c in coords_list) if isinstance(value, (int, float))
+        ]
         x_avg = mean(x_values) if x_values else None
         y_avg = mean(y_values) if y_values else None
         quadrant = _quadrant_label(x_avg, y_avg)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.65.0
+Version: 1.66.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -25,6 +25,7 @@ Classifier: Topic :: Software Development :: Quality Assurance
 Classifier: Topic :: Software Development :: Testing
 Classifier: Typing :: Typed
 Requires-Python: >=3.12
+Requires-Dist: chainlit>=2.9.5
 Requires-Dist: chardet
 Requires-Dist: fastapi>=0.128.0
 Requires-Dist: instructor
@@ -137,12 +138,17 @@ English version? See `README.en.md`.
 ## Quick Links
 
 - Documentation hub: `docs/INDEX.md`
+- CLI run scenario guide: `docs/guides/RAG_CLI_WORKFLOW_TEMPLATES.md`
 - User guide: `docs/guides/USER_GUIDE.md`
 - Development guide: `docs/guides/DEV_GUIDE.md`
 - Status/roadmap: `docs/STATUS.md`, `docs/ROADMAP.md`
 - Development whitepaper (design/operations/quality standards): `docs/new_whitepaper/INDEX.md`
 - Open RAG Trace: `docs/architecture/open-rag-trace-spec.md`
 
+### Notes for upcoming improvements
+- Insurance summary metrics expansion plan: `docs/guides/INSURANCE_SUMMARY_METRICS_PLAN.md`
+- Repeated prompt application plan: `docs/guides/repeat_query.md`
+
 ---
 
 ## The problems EvalVault solves
@@ -470,6 +476,24 @@ npm run dev
 - Ragas family: `faithfulness`, `answer_relevancy`, `context_precision`, `context_recall`, `factual_correctness`, `semantic_similarity`
 - Custom example (domain): `insurance_term_accuracy`
 
+### Design rationale for the summary metrics (summary_score, summary_faithfulness, entity_preservation)
+
+### Custom metric snapshot (recording the evaluation method, process, and results)
+- The evaluation method, inputs/outputs, rules, and implementation file hashes are recorded in `run.tracker_metadata.custom_metric_snapshot`.
+- The snapshot is also saved to the Excel `CustomMetrics` sheet and to the Langfuse/Phoenix/MLflow artifacts.
+
+- `summary_faithfulness`: checks whether every claim in the summary is grounded in the contexts; it directly measures hallucination/distortion risk.
+- `summary_score`: evaluates the balance between preserving key information from the contexts and staying concise, reducing the bias of scoring against a single reference summary.
+- `entity_preservation`: measures whether entities that matter in insurance policies (amounts, periods, conditions, exclusions, etc.) are retained in the summary.
+
+**Rationale for insurance-domain specialization**
+- Elements that are critical in insurance policies (exclusions, deductibles, limits, conditions, etc.) are reflected directly as keywords, and the metrics are designed to preserve core entities such as amounts, periods, and ratios.
+- Because generic rules (numbers/periods/amounts) are combined with insurance-specific keywords, the current state is best described as "weak domain specialization centered on insurance risk".
+
+**Caveats for interpretation**
+- All three metrics depend heavily on `contexts` quality; inaccurate or excessive contexts can lower the scores.
+- `summary_score` is keyphrase-based, so paraphrased wording can produce lower scores.
+
 Exact options and operational recipes are kept up to date against `docs/guides/USER_GUIDE.md`.
 
 ---
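The new README section above states that the evaluation method, inputs/outputs, rules, and implementation file hashes are recorded under `run.tracker_metadata.custom_metric_snapshot` and mirrored to the Excel `CustomMetrics` sheet and tracker artifacts. Below is a minimal sketch of consuming such a snapshot, assuming `tracker_metadata` behaves like a plain dict; the per-metric record keys in the example payload are invented for illustration.

```python
from typing import Any


def list_snapshotted_metrics(tracker_metadata: dict[str, Any]) -> list[str]:
    """Return the names of custom metrics captured in a run's snapshot."""
    snapshot = tracker_metadata.get("custom_metric_snapshot") or {}
    return sorted(snapshot)


# Fabricated payload purely for illustration; real snapshots are produced by
# evalvault/domain/services/custom_metric_snapshot.py (added in this release).
example_metadata = {
    "custom_metric_snapshot": {
        "summary_accuracy": {"rules": "...", "implementation_hash": "..."},
        "summary_risk_coverage": {"rules": "...", "implementation_hash": "..."},
    }
}

print(list_snapshotted_metrics(example_metadata))
# ['summary_accuracy', 'summary_risk_coverage']
```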
@@ -5,10 +5,11 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
 evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
 evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
-evalvault/adapters/inbound/api/adapter.py,sha256=
-evalvault/adapters/inbound/api/main.py,sha256=
+evalvault/adapters/inbound/api/adapter.py,sha256=HgWSYyUxvJPlaSG158WVzpPckpPCYV9Ec3CWN8rLFdI,69118
+evalvault/adapters/inbound/api/main.py,sha256=skYtmDngdOBryyLXQpNGlSd2Te6RF6GtfIwcMACPHFU,7068
 evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
 evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
+evalvault/adapters/inbound/api/routers/chat.py,sha256=3S6-ljiY1COlDuVDH5yzMJs9SO0EkuosRcJIYScHWvI,18143
 evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
 evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
 evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
@@ -42,8 +43,8 @@ evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=NeqWLzO9kRDuZd0pHAIHg
 evalvault/adapters/inbound/cli/commands/profile_difficulty.py,sha256=nOJH3iqgLAlXq4keLBj5oqpiRCg0jjGgT-7Q57HxEh8,6665
 evalvault/adapters/inbound/cli/commands/prompts.py,sha256=lddde5VbjYaqN_9gHPLNu6DWpg5fE-KqZzjN-XYwvJw,27153
 evalvault/adapters/inbound/cli/commands/regress.py,sha256=Dy8hUOdjapxOW9Hoov0DHHblkMaExiqWfYS14CaC9Kk,8806
-evalvault/adapters/inbound/cli/commands/run.py,sha256=
-evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=
+evalvault/adapters/inbound/cli/commands/run.py,sha256=aKoZcQbOJ1KB_4zPk4L-AWw3u9vGWg3SaooR7A3Xd_Y,119910
+evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=93jFUg8QLrD38QU2JhOhFMoHDWUphSEKRdJ5KcUvrkQ,40806
 evalvault/adapters/inbound/cli/commands/stage.py,sha256=oRC9c5CysLX90Iy5Ba1pc_00DaOBS78lcBvzkbdrGRM,17123
 evalvault/adapters/inbound/cli/utils/__init__.py,sha256=QPNKneZS-Z-tTnYYxtgJXgcJWY6puUlRQcKrn7Mlv1M,685
 evalvault/adapters/inbound/cli/utils/analysis_io.py,sha256=RHkKEq4e-PtbtRDlXAJWU80RYHNPw-O5V9_GujdaGfc,13393
@@ -177,10 +178,10 @@ evalvault/adapters/outbound/nlp/korean/toolkit_factory.py,sha256=x3v-AAkVInOabC4
 evalvault/adapters/outbound/phoenix/sync_service.py,sha256=i6gHpNiZXKQ5yzV9B2TPb-P1N45k_Ck5ruzh3oqp4d8,9122
 evalvault/adapters/outbound/report/__init__.py,sha256=8OUduTHnWkBLHYrc7mBg45DnAwz0RgvSJmz1HqxVjLY,477
 evalvault/adapters/outbound/report/dashboard_generator.py,sha256=Dcu18NTK4lS8XNKnnnquagpZkd-4TSf5Mb2isFNW5Pk,7800
-evalvault/adapters/outbound/report/llm_report_generator.py,sha256=
+evalvault/adapters/outbound/report/llm_report_generator.py,sha256=i_iXfY8qutIb8TsvLKyMLnijsA0yiNJ3rBEFg4zVqcE,26858
 evalvault/adapters/outbound/report/markdown_adapter.py,sha256=5PS72h_qe4ZtYs-umhX5TqQL2k5SuDaCUc6rRw9AKRw,16761
 evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
-evalvault/adapters/outbound/storage/base_sql.py,sha256=
+evalvault/adapters/outbound/storage/base_sql.py,sha256=bNjJr941wqeLgv4E772JlOer1Q8OpJWxyotsNNn_R98,42536
 evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
 evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=HLaoQ3YJDFwOxeY0S92oPIqb-7EgWSasgt89RM86vr0,47148
 evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=A9MfO0pjf4kjxoRj2KPI0Gg1cbX13I2YE3oieT-PGiI,8906
@@ -193,10 +194,10 @@ evalvault/adapters/outbound/tracer/open_rag_trace_decorators.py,sha256=LFnk-3FSL
 evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py,sha256=D48Mbj-ioDKztjhV9513Q5DiUNiVdO60B_2sWMFEmnI,3520
 evalvault/adapters/outbound/tracer/phoenix_tracer_adapter.py,sha256=inmTAolAVsm0IrszE9VTJoI7HSvGGAnGNZVu_vZRAGg,741
 evalvault/adapters/outbound/tracker/__init__.py,sha256=Suu5BznOK5uTuD5_jS8JMZd8RPfQNlddLxHCBvMTm_4,358
-evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=
+evalvault/adapters/outbound/tracker/langfuse_adapter.py,sha256=uI-t5v9AC5VUMYsIc1FHYImourZeErGMXB0_prOMErc,18839
 evalvault/adapters/outbound/tracker/log_sanitizer.py,sha256=ilKTTSzsHslQYc-elnWu0Z3HKNNw1D1iI0_cCvYbo1M,2653
-evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=
-evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=
+evalvault/adapters/outbound/tracker/mlflow_adapter.py,sha256=6pSxbxSDZE7jN7uSMU6VFg0JlO7cBiMLYcd53NYpfcY,7350
+evalvault/adapters/outbound/tracker/phoenix_adapter.py,sha256=8p2qJeKn6OvIxNbD16h_QrhmCzKIBIf8_ej535MNn_A,26443
 evalvault/config/__init__.py,sha256=UCgeDx62M2gOuFvdN29wWwny2fdH4bPY_uUC3-42eDw,1297
 evalvault/config/agent_types.py,sha256=EP2Pv3ZtOzDXIvIa-Hnd1to9JIbMUtGitrlwzZtx0Ys,13418
 evalvault/config/domain_config.py,sha256=rOgNA2T8NWlDzcEFC0shdUCCww0lI1E5fUm5QrKQSZI,9264
@@ -205,7 +206,7 @@ evalvault/config/langfuse_support.py,sha256=DEzVMfMGGf1V45W_2oUG-NCDfsYI4UUdnYJI
 evalvault/config/model_config.py,sha256=KlzDbGyDLeOGE7ElekFFk5YjjT5u8i6KO2B4EyZkLnI,3542
 evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHWyVyOH0,13604
 evalvault/config/secret_manager.py,sha256=YjPMuNqeBrAR2BzCJvsBNUExaU4TBSFyZ8kVYZZifqA,4172
-evalvault/config/settings.py,sha256=
+evalvault/config/settings.py,sha256=xvoNma4CHAd8R_nF0DL4MUWXBWCR5M0C68NPSPLT5JQ,18285
 evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
 evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalvault/domain/entities/__init__.py,sha256=wszRJ1Imdc5NJ1bQPC2udk-mAgFdlw4uZV5IPNjLpHQ,3669
@@ -222,20 +223,24 @@ evalvault/domain/entities/judge_calibration.py,sha256=fhQEI7g2nZuG1OliikhxgefcFA
 evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0U8,3001
 evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
 evalvault/domain/entities/method.py,sha256=a3jZi7SjcpK3HeVyVwQkUMwpnmg2RbxCnH4NqYPLCOI,1157
-evalvault/domain/entities/prompt.py,sha256=
+evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KTNjVwfY,2920
 evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
 evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
 evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
 evalvault/domain/entities/stage.py,sha256=UqS59sjoMs_bhMupNtvagbIx8QgHgFjWoRPhJ3uJP2s,7426
-evalvault/domain/metrics/__init__.py,sha256=
+evalvault/domain/metrics/__init__.py,sha256=Ros3CWg5in1xlEdMa0WUSG602SBVkxw2Zbro-XUlmxU,1214
 evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
 evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
 evalvault/domain/metrics/contextual_relevancy.py,sha256=xAPYUv_0TM4j4WOutOSGftNln_l-2Ev6qpANeu4REv8,11057
 evalvault/domain/metrics/entity_preservation.py,sha256=uSCbaETceE5PbGn-230Rm8pryOA8jDkkeOwAkWxA65g,6500
 evalvault/domain/metrics/insurance.py,sha256=5NPeAi_86rpuZRgV4KhzomGrq3Uw2jjglN6FfA_AO8o,4040
 evalvault/domain/metrics/no_answer.py,sha256=x6vRyOa1jw-qsnw9kOYT8YMPdLElaDRu7zjNCpyJfqM,8237
-evalvault/domain/metrics/registry.py,sha256=
+evalvault/domain/metrics/registry.py,sha256=1CKPxSjdXK739zhzwodukGDL-dIhlJJH30cuP-czQWc,6926
 evalvault/domain/metrics/retrieval_rank.py,sha256=F55ByadJBowyKHKBmKAZ0T0qN_R1_7UNu-MiLnT4Ypg,14675
+evalvault/domain/metrics/summary_accuracy.py,sha256=Hr4QS1e4Rxt1MgcTj5rElKuPw9rWS-zGkI0d8wB5dwA,5988
+evalvault/domain/metrics/summary_needs_followup.py,sha256=5kExtZxxankP7csAAIZe_1uRFeBD7NQK-N15b5d0awM,1357
+evalvault/domain/metrics/summary_non_definitive.py,sha256=1EE-z0Ib66gpjc0MGZHmZJHJfpoACSIldgOwFkUNxg0,1029
+evalvault/domain/metrics/summary_risk_coverage.py,sha256=Fo-dMg_jU4MCr0YqOZzBZymwEbG9y2H6eLX-jmuS8IU,1777
 evalvault/domain/metrics/terms_dictionary.json,sha256=-ZQmpx6yMOYoAOpcLj-xK2LkAeCbAw0EUb6-syIOKS0,3801
 evalvault/domain/metrics/text_match.py,sha256=P-YTZs9ekDqEmxLNBP8eXnMRymPdC8V4dJPtwG2ajVM,10219
 evalvault/domain/services/__init__.py,sha256=X5Af1kf_vSt3S3mFwOV6OQdro-lFxwbVdNd7nJznkC8,1024
@@ -248,6 +253,7 @@ evalvault/domain/services/benchmark_runner.py,sha256=4tvQEDrfvp2fC2luUPuPBcRjEPL
 evalvault/domain/services/benchmark_service.py,sha256=TrmnvBMAPmcs0PewGZcn2rxHbviZ8KxmDvJCeyqm28I,6286
 evalvault/domain/services/cache_metrics.py,sha256=FKNZoxym30lc1SxTGmTn3Pr-PDNoAqgC9_d_IdF_jOQ,3463
 evalvault/domain/services/cluster_map_builder.py,sha256=qPKMPj-eSqECJSCOKvv3ZETgIwxwiKWbU3d6_feCoDg,6885
+evalvault/domain/services/custom_metric_snapshot.py,sha256=_MLOzBlHTRyTQ2NuunZ_lrLVF0__kvEcCUxXVVCeoRA,9684
 evalvault/domain/services/dataset_preprocessor.py,sha256=PnhLiPk0E9DIzjUr8N75296CCfl1AUXGv-lpaXBi0Ok,14797
 evalvault/domain/services/debug_report_service.py,sha256=SGdFh8tctAIq7RotFbg47eetxdYSS4Yju7-LOzpCMCM,4386
 evalvault/domain/services/difficulty_profile_reporter.py,sha256=uIj9-eiO2dDvQ6tP-DJBddfBq8VT63st0wtNC8Co4NQ,680
@@ -257,7 +263,7 @@ evalvault/domain/services/document_versioning.py,sha256=M1qZaMpQ2exVT1wkVAmvEPPu
 evalvault/domain/services/domain_learning_hook.py,sha256=rhKBmdnrJyfGzFNsNxzyv8jZO26-WOosHSmBV_9qdJg,7176
 evalvault/domain/services/embedding_overlay.py,sha256=ZTNxUPXpHGbQ3Uri5DD3feTUFn7qrhuNshhyCQEvRuM,3559
 evalvault/domain/services/entity_extractor.py,sha256=f3Rf5saK8QsgetLNK1Hbxzt8PtttJZCicSR63S8DJ5k,14141
-evalvault/domain/services/evaluator.py,sha256=
+evalvault/domain/services/evaluator.py,sha256=Fvth2VdckDJvGuwxbXPnvPfQU59WZSJHV63H4qji4lM,78815
 evalvault/domain/services/experiment_comparator.py,sha256=IBrxIwux-8GucwlLx6e5lUqB9miSPvBLGJK9ctoW7Y0,3299
 evalvault/domain/services/experiment_manager.py,sha256=2k-qGiAUyZuqqmcp4P-M3Z9HTXwwcqW5HQYKNkcIHuI,4863
 evalvault/domain/services/experiment_reporter.py,sha256=QYlVmCFSx8hKTPMezc7QjJE07b3MSQ82Q4QVucSHLVY,1420
@@ -276,7 +282,7 @@ evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xy
 evalvault/domain/services/pipeline_template_registry.py,sha256=aWqXLQ24grpSZo9M4tZLRo1ysD10c6hUpW3JupZH9e0,28083
 evalvault/domain/services/prompt_candidate_service.py,sha256=Ibyb5EaWK28Ju2HnTqHHGOoiA9Q-VwY3hjxVODALwGY,3997
 evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
-evalvault/domain/services/prompt_registry.py,sha256=
+evalvault/domain/services/prompt_registry.py,sha256=QyL4yIcKT93uv6L0-Q_iaNXno8QnsC19YcGekuSRMtE,5247
 evalvault/domain/services/prompt_scoring_service.py,sha256=SlvfuIbhj92RJu4RQAJ1BGKhKkOAUOt3cZNH21HtsX4,9833
 evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9YfZ_yLIxF1MMBo,6731
 evalvault/domain/services/prompt_suggestion_reporter.py,sha256=Fc6sCPebUMk8SZVpjoJ6bCEun0ma-YmayEQnulBVv8s,10577
@@ -292,9 +298,9 @@ evalvault/domain/services/stage_metric_service.py,sha256=_u6ThZ8rGw8H9h3TNpu0j8X
 evalvault/domain/services/stage_summary_service.py,sha256=VVtuAr4vwzvmNFn8rqURJrhKFqAMG4CaBmyGiUk_xG0,1590
 evalvault/domain/services/synthetic_qa_generator.py,sha256=aiOTPoHZbKRTEeodABQ2I5lq8-Vs_kQtuzcGWd4MTGE,16526
 evalvault/domain/services/testset_generator.py,sha256=6IpiZ0pqhKEymo-AlUdfJjDkF2P1n8Md_QKV4nOheyg,4470
-evalvault/domain/services/threshold_profiles.py,sha256=
+evalvault/domain/services/threshold_profiles.py,sha256=yYJ7o8SIRufI7kUN8edh8am-dVOq_TEhvDqlHe0WQUQ,1433
 evalvault/domain/services/unified_report_service.py,sha256=lG3VpMLC1MTYUlcGl-MUEE4PUopkyrhcgj4_ye9c_vM,11829
-evalvault/domain/services/visual_space_service.py,sha256=
+evalvault/domain/services/visual_space_service.py,sha256=3_qyBsThr5lzP1le6qkXf9ByX3JjoYGX15iMIHe8gQs,34958
 evalvault/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalvault/ports/inbound/__init__.py,sha256=2Wsc0vNzH8_ZaErk4OHxP93hRonLUkMbn3W28DtTDO0,562
 evalvault/ports/inbound/analysis_pipeline_port.py,sha256=RJfKtp22AYEqnmRk6RDawAK52rEmyAhuk0FUPJQUwQU,1758
@@ -332,8 +338,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
+evalvault-1.66.0.dist-info/METADATA,sha256=f6jzeYkN1iuFwYJTcI8r5L52hVNZwACOlQuWYvVz_JY,26159
+evalvault-1.66.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.66.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.66.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.66.0.dist-info/RECORD,,
File without changes
File without changes
File without changes