evalvault 1.57.1__py3-none-any.whl → 1.59.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/routers/pipeline.py +48 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +434 -179
- evalvault/adapters/inbound/cli/commands/pipeline.py +5 -1
- evalvault/adapters/inbound/cli/commands/run.py +628 -183
- evalvault/adapters/inbound/cli/commands/run_helpers.py +29 -30
- evalvault/adapters/inbound/cli/utils/analysis_io.py +2 -2
- evalvault/adapters/inbound/cli/utils/progress.py +2 -2
- evalvault/adapters/outbound/analysis/__init__.py +13 -3
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +2 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +2 -1
- evalvault/adapters/outbound/analysis/hypothesis_generator_module.py +359 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +9 -9
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +250 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +3 -0
- evalvault/adapters/outbound/analysis/pipeline_helpers.py +1 -1
- evalvault/adapters/outbound/analysis/priority_summary_module.py +1 -1
- evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py +3 -2
- evalvault/adapters/outbound/analysis/timeseries_advanced_module.py +349 -0
- evalvault/adapters/outbound/benchmark/lm_eval_adapter.py +1 -1
- evalvault/adapters/outbound/documents/__init__.py +4 -0
- evalvault/adapters/outbound/documents/ocr/__init__.py +3 -0
- evalvault/adapters/outbound/documents/ocr/paddleocr_backend.py +112 -0
- evalvault/adapters/outbound/documents/pdf_extractor.py +50 -0
- evalvault/adapters/outbound/documents/versioned_loader.py +244 -0
- evalvault/adapters/outbound/improvement/insight_generator.py +23 -12
- evalvault/adapters/outbound/improvement/pattern_detector.py +16 -10
- evalvault/adapters/outbound/improvement/playbook_loader.py +21 -13
- evalvault/adapters/outbound/kg/graph_rag_retriever.py +2 -1
- evalvault/adapters/outbound/llm/__init__.py +63 -63
- evalvault/adapters/outbound/llm/instructor_factory.py +101 -7
- evalvault/adapters/outbound/llm/ollama_adapter.py +27 -27
- evalvault/adapters/outbound/llm/token_aware_chat.py +1 -1
- evalvault/adapters/outbound/report/__init__.py +2 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +197 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +4 -4
- evalvault/adapters/outbound/report/markdown_adapter.py +61 -63
- evalvault/adapters/outbound/storage/postgres_adapter.py +1 -1
- evalvault/adapters/outbound/tracer/open_rag_log_handler.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py +4 -4
- evalvault/config/settings.py +10 -0
- evalvault/domain/entities/analysis_pipeline.py +13 -3
- evalvault/domain/services/analysis_service.py +3 -3
- evalvault/domain/services/document_versioning.py +119 -0
- evalvault/domain/services/evaluator.py +1 -1
- evalvault/domain/services/pipeline_template_registry.py +197 -127
- evalvault/domain/services/retriever_context.py +56 -2
- evalvault/domain/services/visual_space_service.py +1 -1
- evalvault/ports/outbound/analysis_port.py +2 -2
- evalvault/ports/outbound/improvement_port.py +4 -0
- evalvault-1.59.0.dist-info/METADATA +327 -0
- {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/RECORD +55 -45
- evalvault-1.57.1.dist-info/METADATA +0 -683
- {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/WHEEL +0 -0
- {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -51,136 +51,22 @@ class PipelineTemplateRegistry:
|
|
|
51
51
|
)
|
|
52
52
|
self._templates[AnalysisIntent.ANALYZE_PATTERNS] = self._create_analyze_patterns_template()
|
|
53
53
|
self._templates[AnalysisIntent.ANALYZE_TRENDS] = self._create_analyze_trends_template()
|
|
54
|
-
self._templates[AnalysisIntent.
|
|
55
|
-
self.
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
# 보고서 템플릿
|
|
59
|
-
self._templates[AnalysisIntent.GENERATE_SUMMARY] = self._create_generate_summary_template()
|
|
60
|
-
self._templates[AnalysisIntent.GENERATE_DETAILED] = (
|
|
61
|
-
self._create_generate_detailed_template()
|
|
54
|
+
self._templates[AnalysisIntent.ANALYZE_STATISTICAL] = (
|
|
55
|
+
self._create_analyze_statistical_template()
|
|
62
56
|
)
|
|
63
|
-
self._templates[AnalysisIntent.
|
|
64
|
-
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
Args:
|
|
71
|
-
intent: 분석 의도
|
|
72
|
-
|
|
73
|
-
Returns:
|
|
74
|
-
파이프라인 템플릿 또는 None
|
|
75
|
-
"""
|
|
76
|
-
return self._templates.get(intent)
|
|
77
|
-
|
|
78
|
-
def list_all(self) -> list[tuple[AnalysisIntent, AnalysisPipeline]]:
|
|
79
|
-
"""모든 템플릿 목록.
|
|
80
|
-
|
|
81
|
-
Returns:
|
|
82
|
-
(의도, 템플릿) 튜플 목록
|
|
83
|
-
"""
|
|
84
|
-
return list(self._templates.items())
|
|
85
|
-
|
|
86
|
-
# =========================================================================
|
|
87
|
-
# Verification Templates
|
|
88
|
-
# =========================================================================
|
|
89
|
-
|
|
90
|
-
def _create_verify_morpheme_template(self) -> AnalysisPipeline:
|
|
91
|
-
"""형태소 분석 검증 템플릿."""
|
|
92
|
-
nodes = [
|
|
93
|
-
AnalysisNode(
|
|
94
|
-
id="load_data",
|
|
95
|
-
name="데이터 로드",
|
|
96
|
-
module="data_loader",
|
|
97
|
-
),
|
|
98
|
-
AnalysisNode(
|
|
99
|
-
id="morpheme_analysis",
|
|
100
|
-
name="형태소 분석",
|
|
101
|
-
module="morpheme_analyzer",
|
|
102
|
-
depends_on=["load_data"],
|
|
103
|
-
),
|
|
104
|
-
AnalysisNode(
|
|
105
|
-
id="quality_check",
|
|
106
|
-
name="품질 검사",
|
|
107
|
-
module="morpheme_quality_checker",
|
|
108
|
-
depends_on=["morpheme_analysis"],
|
|
109
|
-
),
|
|
110
|
-
AnalysisNode(
|
|
111
|
-
id="report",
|
|
112
|
-
name="검증 보고서",
|
|
113
|
-
module="verification_report",
|
|
114
|
-
depends_on=["quality_check"],
|
|
115
|
-
),
|
|
116
|
-
]
|
|
117
|
-
return AnalysisPipeline(
|
|
118
|
-
intent=AnalysisIntent.VERIFY_MORPHEME,
|
|
119
|
-
nodes=nodes,
|
|
57
|
+
self._templates[AnalysisIntent.ANALYZE_NLP] = self._create_analyze_nlp_template()
|
|
58
|
+
self._templates[AnalysisIntent.ANALYZE_CAUSAL] = self._create_analyze_causal_template()
|
|
59
|
+
self._templates[AnalysisIntent.ANALYZE_NETWORK] = self._create_analyze_network_template()
|
|
60
|
+
self._templates[AnalysisIntent.ANALYZE_PLAYBOOK] = self._create_analyze_playbook_template()
|
|
61
|
+
self._templates[AnalysisIntent.DETECT_ANOMALIES] = self._create_detect_anomalies_template()
|
|
62
|
+
self._templates[AnalysisIntent.FORECAST_PERFORMANCE] = (
|
|
63
|
+
self._create_forecast_performance_template()
|
|
120
64
|
)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
"""임베딩 품질 검증 템플릿."""
|
|
124
|
-
nodes = [
|
|
125
|
-
AnalysisNode(
|
|
126
|
-
id="load_data",
|
|
127
|
-
name="데이터 로드",
|
|
128
|
-
module="data_loader",
|
|
129
|
-
),
|
|
130
|
-
AnalysisNode(
|
|
131
|
-
id="embedding_analysis",
|
|
132
|
-
name="임베딩 분석",
|
|
133
|
-
module="embedding_analyzer",
|
|
134
|
-
depends_on=["load_data"],
|
|
135
|
-
),
|
|
136
|
-
AnalysisNode(
|
|
137
|
-
id="distribution_check",
|
|
138
|
-
name="분포 검사",
|
|
139
|
-
module="embedding_distribution",
|
|
140
|
-
depends_on=["embedding_analysis"],
|
|
141
|
-
),
|
|
142
|
-
AnalysisNode(
|
|
143
|
-
id="report",
|
|
144
|
-
name="검증 보고서",
|
|
145
|
-
module="verification_report",
|
|
146
|
-
depends_on=["distribution_check"],
|
|
147
|
-
),
|
|
148
|
-
]
|
|
149
|
-
return AnalysisPipeline(
|
|
150
|
-
intent=AnalysisIntent.VERIFY_EMBEDDING,
|
|
151
|
-
nodes=nodes,
|
|
65
|
+
self._templates[AnalysisIntent.GENERATE_HYPOTHESES] = (
|
|
66
|
+
self._create_generate_hypotheses_template()
|
|
152
67
|
)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
"""검색 품질 검증 템플릿."""
|
|
156
|
-
nodes = [
|
|
157
|
-
AnalysisNode(
|
|
158
|
-
id="load_data",
|
|
159
|
-
name="데이터 로드",
|
|
160
|
-
module="data_loader",
|
|
161
|
-
),
|
|
162
|
-
AnalysisNode(
|
|
163
|
-
id="retrieval_analysis",
|
|
164
|
-
name="검색 분석",
|
|
165
|
-
module="retrieval_analyzer",
|
|
166
|
-
depends_on=["load_data"],
|
|
167
|
-
),
|
|
168
|
-
AnalysisNode(
|
|
169
|
-
id="quality_check",
|
|
170
|
-
name="품질 검사",
|
|
171
|
-
module="retrieval_quality_checker",
|
|
172
|
-
depends_on=["retrieval_analysis"],
|
|
173
|
-
),
|
|
174
|
-
AnalysisNode(
|
|
175
|
-
id="report",
|
|
176
|
-
name="검증 보고서",
|
|
177
|
-
module="verification_report",
|
|
178
|
-
depends_on=["quality_check"],
|
|
179
|
-
),
|
|
180
|
-
]
|
|
181
|
-
return AnalysisPipeline(
|
|
182
|
-
intent=AnalysisIntent.VERIFY_RETRIEVAL,
|
|
183
|
-
nodes=nodes,
|
|
68
|
+
self._templates[AnalysisIntent.BENCHMARK_RETRIEVAL] = (
|
|
69
|
+
self._create_benchmark_retrieval_template()
|
|
184
70
|
)
|
|
185
71
|
|
|
186
72
|
# =========================================================================
|
|
@@ -441,6 +327,190 @@ class PipelineTemplateRegistry:
|
|
|
441
327
|
nodes=nodes,
|
|
442
328
|
)
|
|
443
329
|
|
|
330
|
+
def _create_analyze_statistical_template(self) -> AnalysisPipeline:
|
|
331
|
+
nodes = [
|
|
332
|
+
AnalysisNode(
|
|
333
|
+
id="load_data",
|
|
334
|
+
name="데이터 로드",
|
|
335
|
+
module="data_loader",
|
|
336
|
+
),
|
|
337
|
+
AnalysisNode(
|
|
338
|
+
id="statistics",
|
|
339
|
+
name="통계 분석",
|
|
340
|
+
module="statistical_analyzer",
|
|
341
|
+
depends_on=["load_data"],
|
|
342
|
+
),
|
|
343
|
+
]
|
|
344
|
+
return AnalysisPipeline(
|
|
345
|
+
intent=AnalysisIntent.ANALYZE_STATISTICAL,
|
|
346
|
+
nodes=nodes,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
def _create_analyze_nlp_template(self) -> AnalysisPipeline:
|
|
350
|
+
nodes = [
|
|
351
|
+
AnalysisNode(
|
|
352
|
+
id="load_data",
|
|
353
|
+
name="데이터 로드",
|
|
354
|
+
module="data_loader",
|
|
355
|
+
),
|
|
356
|
+
AnalysisNode(
|
|
357
|
+
id="nlp_analysis",
|
|
358
|
+
name="NLP 분석",
|
|
359
|
+
module="nlp_analyzer",
|
|
360
|
+
depends_on=["load_data"],
|
|
361
|
+
),
|
|
362
|
+
]
|
|
363
|
+
return AnalysisPipeline(
|
|
364
|
+
intent=AnalysisIntent.ANALYZE_NLP,
|
|
365
|
+
nodes=nodes,
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
def _create_analyze_causal_template(self) -> AnalysisPipeline:
|
|
369
|
+
nodes = [
|
|
370
|
+
AnalysisNode(
|
|
371
|
+
id="load_data",
|
|
372
|
+
name="데이터 로드",
|
|
373
|
+
module="data_loader",
|
|
374
|
+
),
|
|
375
|
+
AnalysisNode(
|
|
376
|
+
id="statistics",
|
|
377
|
+
name="통계 분석",
|
|
378
|
+
module="statistical_analyzer",
|
|
379
|
+
depends_on=["load_data"],
|
|
380
|
+
),
|
|
381
|
+
AnalysisNode(
|
|
382
|
+
id="causal_analysis",
|
|
383
|
+
name="인과 분석",
|
|
384
|
+
module="causal_analyzer",
|
|
385
|
+
depends_on=["load_data", "statistics"],
|
|
386
|
+
),
|
|
387
|
+
]
|
|
388
|
+
return AnalysisPipeline(
|
|
389
|
+
intent=AnalysisIntent.ANALYZE_CAUSAL,
|
|
390
|
+
nodes=nodes,
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
def _create_analyze_network_template(self) -> AnalysisPipeline:
|
|
394
|
+
nodes = [
|
|
395
|
+
AnalysisNode(
|
|
396
|
+
id="load_data",
|
|
397
|
+
name="데이터 로드",
|
|
398
|
+
module="data_loader",
|
|
399
|
+
),
|
|
400
|
+
AnalysisNode(
|
|
401
|
+
id="statistics",
|
|
402
|
+
name="통계 분석",
|
|
403
|
+
module="statistical_analyzer",
|
|
404
|
+
depends_on=["load_data"],
|
|
405
|
+
),
|
|
406
|
+
AnalysisNode(
|
|
407
|
+
id="network_analysis",
|
|
408
|
+
name="네트워크 분석",
|
|
409
|
+
module="network_analyzer",
|
|
410
|
+
depends_on=["statistics"],
|
|
411
|
+
),
|
|
412
|
+
]
|
|
413
|
+
return AnalysisPipeline(
|
|
414
|
+
intent=AnalysisIntent.ANALYZE_NETWORK,
|
|
415
|
+
nodes=nodes,
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
def _create_analyze_playbook_template(self) -> AnalysisPipeline:
|
|
419
|
+
nodes = [
|
|
420
|
+
AnalysisNode(
|
|
421
|
+
id="load_data",
|
|
422
|
+
name="데이터 로드",
|
|
423
|
+
module="data_loader",
|
|
424
|
+
),
|
|
425
|
+
AnalysisNode(
|
|
426
|
+
id="diagnostic",
|
|
427
|
+
name="진단 분석",
|
|
428
|
+
module="diagnostic_playbook",
|
|
429
|
+
depends_on=["load_data"],
|
|
430
|
+
),
|
|
431
|
+
]
|
|
432
|
+
return AnalysisPipeline(
|
|
433
|
+
intent=AnalysisIntent.ANALYZE_PLAYBOOK,
|
|
434
|
+
nodes=nodes,
|
|
435
|
+
)
|
|
436
|
+
|
|
437
|
+
def _create_detect_anomalies_template(self) -> AnalysisPipeline:
|
|
438
|
+
nodes = [
|
|
439
|
+
AnalysisNode(
|
|
440
|
+
id="load_runs",
|
|
441
|
+
name="실행 기록 로드",
|
|
442
|
+
module="run_loader",
|
|
443
|
+
),
|
|
444
|
+
AnalysisNode(
|
|
445
|
+
id="anomaly_detection",
|
|
446
|
+
name="이상 탐지",
|
|
447
|
+
module="timeseries_advanced",
|
|
448
|
+
params={"mode": "anomaly"},
|
|
449
|
+
depends_on=["load_runs"],
|
|
450
|
+
),
|
|
451
|
+
]
|
|
452
|
+
return AnalysisPipeline(
|
|
453
|
+
intent=AnalysisIntent.DETECT_ANOMALIES,
|
|
454
|
+
nodes=nodes,
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
def _create_forecast_performance_template(self) -> AnalysisPipeline:
|
|
458
|
+
nodes = [
|
|
459
|
+
AnalysisNode(
|
|
460
|
+
id="load_runs",
|
|
461
|
+
name="실행 기록 로드",
|
|
462
|
+
module="run_loader",
|
|
463
|
+
),
|
|
464
|
+
AnalysisNode(
|
|
465
|
+
id="forecast",
|
|
466
|
+
name="성능 예측",
|
|
467
|
+
module="timeseries_advanced",
|
|
468
|
+
params={"mode": "forecast"},
|
|
469
|
+
depends_on=["load_runs"],
|
|
470
|
+
),
|
|
471
|
+
]
|
|
472
|
+
return AnalysisPipeline(
|
|
473
|
+
intent=AnalysisIntent.FORECAST_PERFORMANCE,
|
|
474
|
+
nodes=nodes,
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
def _create_generate_hypotheses_template(self) -> AnalysisPipeline:
|
|
478
|
+
nodes = [
|
|
479
|
+
AnalysisNode(
|
|
480
|
+
id="load_data",
|
|
481
|
+
name="데이터 로드",
|
|
482
|
+
module="data_loader",
|
|
483
|
+
),
|
|
484
|
+
AnalysisNode(
|
|
485
|
+
id="statistics",
|
|
486
|
+
name="통계 분석",
|
|
487
|
+
module="statistical_analyzer",
|
|
488
|
+
depends_on=["load_data"],
|
|
489
|
+
),
|
|
490
|
+
AnalysisNode(
|
|
491
|
+
id="ragas_eval",
|
|
492
|
+
name="RAGAS 평가",
|
|
493
|
+
module="ragas_evaluator",
|
|
494
|
+
depends_on=["load_data"],
|
|
495
|
+
),
|
|
496
|
+
AnalysisNode(
|
|
497
|
+
id="low_samples",
|
|
498
|
+
name="낮은 성능 케이스 추출",
|
|
499
|
+
module="low_performer_extractor",
|
|
500
|
+
depends_on=["ragas_eval"],
|
|
501
|
+
),
|
|
502
|
+
AnalysisNode(
|
|
503
|
+
id="hypothesis",
|
|
504
|
+
name="가설 생성",
|
|
505
|
+
module="hypothesis_generator",
|
|
506
|
+
depends_on=["statistics", "low_samples"],
|
|
507
|
+
),
|
|
508
|
+
]
|
|
509
|
+
return AnalysisPipeline(
|
|
510
|
+
intent=AnalysisIntent.GENERATE_HYPOTHESES,
|
|
511
|
+
nodes=nodes,
|
|
512
|
+
)
|
|
513
|
+
|
|
444
514
|
def _create_benchmark_retrieval_template(self) -> AnalysisPipeline:
|
|
445
515
|
"""검색 벤치마크 템플릿."""
|
|
446
516
|
nodes = [
|
|
@@ -3,10 +3,16 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import time
|
|
6
|
-
from collections.abc import Sequence
|
|
6
|
+
from collections.abc import Callable, Sequence
|
|
7
|
+
from datetime import date
|
|
7
8
|
from typing import Any
|
|
8
9
|
|
|
9
|
-
from evalvault.domain.entities import Dataset
|
|
10
|
+
from evalvault.domain.entities import Dataset, TestCase
|
|
11
|
+
from evalvault.domain.services.document_versioning import (
|
|
12
|
+
VersionedChunk,
|
|
13
|
+
parse_contract_date,
|
|
14
|
+
select_chunks_for_contract_date,
|
|
15
|
+
)
|
|
10
16
|
from evalvault.ports.outbound.korean_nlp_port import RetrieverPort, RetrieverResultProtocol
|
|
11
17
|
|
|
12
18
|
|
|
@@ -156,3 +162,51 @@ def _compact_values(values: set[str]) -> str | list[str]:
|
|
|
156
162
|
if len(values) == 1:
|
|
157
163
|
return next(iter(values))
|
|
158
164
|
return sorted(values)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def apply_versioned_retriever_to_dataset(
|
|
168
|
+
*,
|
|
169
|
+
dataset: Dataset,
|
|
170
|
+
versioned_chunks: Sequence[VersionedChunk],
|
|
171
|
+
build_retriever: Callable[[Sequence[str]], RetrieverPort],
|
|
172
|
+
top_k: int,
|
|
173
|
+
) -> dict[str, dict[str, Any]]:
|
|
174
|
+
cases_by_contract: dict[date | None, list[TestCase]] = {}
|
|
175
|
+
for test_case in dataset.test_cases:
|
|
176
|
+
if _has_contexts(test_case.contexts):
|
|
177
|
+
continue
|
|
178
|
+
contract = None
|
|
179
|
+
if isinstance(test_case.metadata, dict):
|
|
180
|
+
contract = parse_contract_date(test_case.metadata.get("contract_date"))
|
|
181
|
+
cases_by_contract.setdefault(contract, []).append(test_case)
|
|
182
|
+
|
|
183
|
+
if not cases_by_contract:
|
|
184
|
+
return {}
|
|
185
|
+
|
|
186
|
+
retrieval_metadata: dict[str, dict[str, Any]] = {}
|
|
187
|
+
chunk_list = list(versioned_chunks)
|
|
188
|
+
|
|
189
|
+
for contract, cases in cases_by_contract.items():
|
|
190
|
+
selected = select_chunks_for_contract_date(chunk_list, contract)
|
|
191
|
+
documents = [chunk.content for chunk in selected]
|
|
192
|
+
doc_ids = [chunk.doc_id for chunk in selected]
|
|
193
|
+
|
|
194
|
+
retriever = build_retriever(documents)
|
|
195
|
+
subset = Dataset(
|
|
196
|
+
name=dataset.name,
|
|
197
|
+
version=dataset.version,
|
|
198
|
+
test_cases=cases,
|
|
199
|
+
metadata=dict(dataset.metadata or {}),
|
|
200
|
+
source_file=dataset.source_file,
|
|
201
|
+
thresholds=dict(dataset.thresholds or {}),
|
|
202
|
+
)
|
|
203
|
+
retrieval_metadata.update(
|
|
204
|
+
apply_retriever_to_dataset(
|
|
205
|
+
dataset=subset,
|
|
206
|
+
retriever=retriever,
|
|
207
|
+
top_k=top_k,
|
|
208
|
+
doc_ids=doc_ids,
|
|
209
|
+
)
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
return retrieval_metadata
|
|
@@ -591,7 +591,7 @@ def _resolve_result_count_norm(stage_events: list[StageEvent]) -> float | None:
|
|
|
591
591
|
if isinstance(doc_ids, list):
|
|
592
592
|
counts.append(len(doc_ids))
|
|
593
593
|
top_k = event.attributes.get("top_k")
|
|
594
|
-
if isinstance(top_k,
|
|
594
|
+
if isinstance(top_k, int | float) and top_k > 0:
|
|
595
595
|
top_ks.append(int(top_k))
|
|
596
596
|
else:
|
|
597
597
|
top_ks.append(len(doc_ids))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""분석 서비스 인터페이스."""
|
|
2
2
|
|
|
3
|
-
from typing import TYPE_CHECKING, Protocol
|
|
3
|
+
from typing import TYPE_CHECKING, Literal, Protocol
|
|
4
4
|
|
|
5
5
|
from evalvault.domain.entities import EvaluationRun
|
|
6
6
|
from evalvault.domain.entities.analysis import (
|
|
@@ -44,7 +44,7 @@ class AnalysisPort(Protocol):
|
|
|
44
44
|
run_a: EvaluationRun,
|
|
45
45
|
run_b: EvaluationRun,
|
|
46
46
|
metrics: list[str] | None = None,
|
|
47
|
-
test_type:
|
|
47
|
+
test_type: Literal["t-test", "mann-whitney"] = "t-test",
|
|
48
48
|
) -> list[ComparisonResult]:
|
|
49
49
|
"""두 실행을 통계적으로 비교합니다.
|
|
50
50
|
|
|
@@ -28,6 +28,7 @@ class PatternDefinitionProtocol(Protocol):
|
|
|
28
28
|
pattern_type: str
|
|
29
29
|
component: str
|
|
30
30
|
priority: str
|
|
31
|
+
detection_rules: Sequence[Any]
|
|
31
32
|
actions: Sequence[ActionDefinitionProtocol]
|
|
32
33
|
|
|
33
34
|
|
|
@@ -55,6 +56,7 @@ class PatternDetectorPort(Protocol):
|
|
|
55
56
|
metrics: Sequence[str] | None = None,
|
|
56
57
|
) -> Mapping[str, list[PatternEvidence]]:
|
|
57
58
|
"""Detect problematic patterns for the evaluation run."""
|
|
59
|
+
...
|
|
58
60
|
|
|
59
61
|
|
|
60
62
|
@runtime_checkable
|
|
@@ -71,6 +73,7 @@ class InsightGeneratorPort(Protocol):
|
|
|
71
73
|
|
|
72
74
|
def enrich_failure_sample(self, failure: FailureSample) -> FailureSample:
|
|
73
75
|
"""Enrich a single failure sample using LLM analysis."""
|
|
76
|
+
...
|
|
74
77
|
|
|
75
78
|
def analyze_batch_failures(
|
|
76
79
|
self,
|
|
@@ -80,3 +83,4 @@ class InsightGeneratorPort(Protocol):
|
|
|
80
83
|
threshold: float,
|
|
81
84
|
) -> ClaimImprovementProtocol:
|
|
82
85
|
"""Produce aggregated insights for multiple failures."""
|
|
86
|
+
...
|