evalvault-1.57.1-py3-none-any.whl → evalvault-1.59.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. evalvault/adapters/inbound/api/routers/pipeline.py +48 -0
  2. evalvault/adapters/inbound/cli/commands/analyze.py +434 -179
  3. evalvault/adapters/inbound/cli/commands/pipeline.py +5 -1
  4. evalvault/adapters/inbound/cli/commands/run.py +628 -183
  5. evalvault/adapters/inbound/cli/commands/run_helpers.py +29 -30
  6. evalvault/adapters/inbound/cli/utils/analysis_io.py +2 -2
  7. evalvault/adapters/inbound/cli/utils/progress.py +2 -2
  8. evalvault/adapters/outbound/analysis/__init__.py +13 -3
  9. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +2 -1
  10. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +2 -1
  11. evalvault/adapters/outbound/analysis/hypothesis_generator_module.py +359 -0
  12. evalvault/adapters/outbound/analysis/llm_report_module.py +9 -9
  13. evalvault/adapters/outbound/analysis/network_analyzer_module.py +250 -0
  14. evalvault/adapters/outbound/analysis/pipeline_factory.py +3 -0
  15. evalvault/adapters/outbound/analysis/pipeline_helpers.py +1 -1
  16. evalvault/adapters/outbound/analysis/priority_summary_module.py +1 -1
  17. evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py +3 -2
  18. evalvault/adapters/outbound/analysis/timeseries_advanced_module.py +349 -0
  19. evalvault/adapters/outbound/benchmark/lm_eval_adapter.py +1 -1
  20. evalvault/adapters/outbound/documents/__init__.py +4 -0
  21. evalvault/adapters/outbound/documents/ocr/__init__.py +3 -0
  22. evalvault/adapters/outbound/documents/ocr/paddleocr_backend.py +112 -0
  23. evalvault/adapters/outbound/documents/pdf_extractor.py +50 -0
  24. evalvault/adapters/outbound/documents/versioned_loader.py +244 -0
  25. evalvault/adapters/outbound/improvement/insight_generator.py +23 -12
  26. evalvault/adapters/outbound/improvement/pattern_detector.py +16 -10
  27. evalvault/adapters/outbound/improvement/playbook_loader.py +21 -13
  28. evalvault/adapters/outbound/kg/graph_rag_retriever.py +2 -1
  29. evalvault/adapters/outbound/llm/__init__.py +63 -63
  30. evalvault/adapters/outbound/llm/instructor_factory.py +101 -7
  31. evalvault/adapters/outbound/llm/ollama_adapter.py +27 -27
  32. evalvault/adapters/outbound/llm/token_aware_chat.py +1 -1
  33. evalvault/adapters/outbound/report/__init__.py +2 -0
  34. evalvault/adapters/outbound/report/dashboard_generator.py +197 -0
  35. evalvault/adapters/outbound/report/llm_report_generator.py +4 -4
  36. evalvault/adapters/outbound/report/markdown_adapter.py +61 -63
  37. evalvault/adapters/outbound/storage/postgres_adapter.py +1 -1
  38. evalvault/adapters/outbound/tracer/open_rag_log_handler.py +3 -3
  39. evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +3 -3
  40. evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py +4 -4
  41. evalvault/config/settings.py +10 -0
  42. evalvault/domain/entities/analysis_pipeline.py +13 -3
  43. evalvault/domain/services/analysis_service.py +3 -3
  44. evalvault/domain/services/document_versioning.py +119 -0
  45. evalvault/domain/services/evaluator.py +1 -1
  46. evalvault/domain/services/pipeline_template_registry.py +197 -127
  47. evalvault/domain/services/retriever_context.py +56 -2
  48. evalvault/domain/services/visual_space_service.py +1 -1
  49. evalvault/ports/outbound/analysis_port.py +2 -2
  50. evalvault/ports/outbound/improvement_port.py +4 -0
  51. evalvault-1.59.0.dist-info/METADATA +327 -0
  52. {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/RECORD +55 -45
  53. evalvault-1.57.1.dist-info/METADATA +0 -683
  54. {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/WHEEL +0 -0
  55. {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/entry_points.txt +0 -0
  56. {evalvault-1.57.1.dist-info → evalvault-1.59.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -51,136 +51,22 @@ class PipelineTemplateRegistry:
         )
         self._templates[AnalysisIntent.ANALYZE_PATTERNS] = self._create_analyze_patterns_template()
         self._templates[AnalysisIntent.ANALYZE_TRENDS] = self._create_analyze_trends_template()
-        self._templates[AnalysisIntent.BENCHMARK_RETRIEVAL] = (
-            self._create_benchmark_retrieval_template()
-        )
-
-        # 보고서 템플릿
-        self._templates[AnalysisIntent.GENERATE_SUMMARY] = self._create_generate_summary_template()
-        self._templates[AnalysisIntent.GENERATE_DETAILED] = (
-            self._create_generate_detailed_template()
+        self._templates[AnalysisIntent.ANALYZE_STATISTICAL] = (
+            self._create_analyze_statistical_template()
         )
-        self._templates[AnalysisIntent.GENERATE_COMPARISON] = (
-            self._create_generate_comparison_template()
-        )
-
-    def get_template(self, intent: AnalysisIntent) -> AnalysisPipeline | None:
-        """의도에 대한 템플릿 조회.
-
-        Args:
-            intent: 분석 의도
-
-        Returns:
-            파이프라인 템플릿 또는 None
-        """
-        return self._templates.get(intent)
-
-    def list_all(self) -> list[tuple[AnalysisIntent, AnalysisPipeline]]:
-        """모든 템플릿 목록.
-
-        Returns:
-            (의도, 템플릿) 튜플 목록
-        """
-        return list(self._templates.items())
-
-    # =========================================================================
-    # Verification Templates
-    # =========================================================================
-
-    def _create_verify_morpheme_template(self) -> AnalysisPipeline:
-        """형태소 분석 검증 템플릿."""
-        nodes = [
-            AnalysisNode(
-                id="load_data",
-                name="데이터 로드",
-                module="data_loader",
-            ),
-            AnalysisNode(
-                id="morpheme_analysis",
-                name="형태소 분석",
-                module="morpheme_analyzer",
-                depends_on=["load_data"],
-            ),
-            AnalysisNode(
-                id="quality_check",
-                name="품질 검사",
-                module="morpheme_quality_checker",
-                depends_on=["morpheme_analysis"],
-            ),
-            AnalysisNode(
-                id="report",
-                name="검증 보고서",
-                module="verification_report",
-                depends_on=["quality_check"],
-            ),
-        ]
-        return AnalysisPipeline(
-            intent=AnalysisIntent.VERIFY_MORPHEME,
-            nodes=nodes,
+        self._templates[AnalysisIntent.ANALYZE_NLP] = self._create_analyze_nlp_template()
+        self._templates[AnalysisIntent.ANALYZE_CAUSAL] = self._create_analyze_causal_template()
+        self._templates[AnalysisIntent.ANALYZE_NETWORK] = self._create_analyze_network_template()
+        self._templates[AnalysisIntent.ANALYZE_PLAYBOOK] = self._create_analyze_playbook_template()
+        self._templates[AnalysisIntent.DETECT_ANOMALIES] = self._create_detect_anomalies_template()
+        self._templates[AnalysisIntent.FORECAST_PERFORMANCE] = (
+            self._create_forecast_performance_template()
         )
-
-    def _create_verify_embedding_template(self) -> AnalysisPipeline:
-        """임베딩 품질 검증 템플릿."""
-        nodes = [
-            AnalysisNode(
-                id="load_data",
-                name="데이터 로드",
-                module="data_loader",
-            ),
-            AnalysisNode(
-                id="embedding_analysis",
-                name="임베딩 분석",
-                module="embedding_analyzer",
-                depends_on=["load_data"],
-            ),
-            AnalysisNode(
-                id="distribution_check",
-                name="분포 검사",
-                module="embedding_distribution",
-                depends_on=["embedding_analysis"],
-            ),
-            AnalysisNode(
-                id="report",
-                name="검증 보고서",
-                module="verification_report",
-                depends_on=["distribution_check"],
-            ),
-        ]
-        return AnalysisPipeline(
-            intent=AnalysisIntent.VERIFY_EMBEDDING,
-            nodes=nodes,
+        self._templates[AnalysisIntent.GENERATE_HYPOTHESES] = (
+            self._create_generate_hypotheses_template()
         )
-
-    def _create_verify_retrieval_template(self) -> AnalysisPipeline:
-        """검색 품질 검증 템플릿."""
-        nodes = [
-            AnalysisNode(
-                id="load_data",
-                name="데이터 로드",
-                module="data_loader",
-            ),
-            AnalysisNode(
-                id="retrieval_analysis",
-                name="검색 분석",
-                module="retrieval_analyzer",
-                depends_on=["load_data"],
-            ),
-            AnalysisNode(
-                id="quality_check",
-                name="품질 검사",
-                module="retrieval_quality_checker",
-                depends_on=["retrieval_analysis"],
-            ),
-            AnalysisNode(
-                id="report",
-                name="검증 보고서",
-                module="verification_report",
-                depends_on=["quality_check"],
-            ),
-        ]
-        return AnalysisPipeline(
-            intent=AnalysisIntent.VERIFY_RETRIEVAL,
-            nodes=nodes,
+        self._templates[AnalysisIntent.BENCHMARK_RETRIEVAL] = (
+            self._create_benchmark_retrieval_template()
         )

     # =========================================================================
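For context, the registry above is a plain mapping from AnalysisIntent members to pipeline templates, and the get_template accessor removed in this hunk resolved intents with dict.get. A minimal sketch of that lookup pattern with a toy enum (not evalvault's actual classes; the real registration may have moved elsewhere in the 1.59.0 file):

```python
from enum import Enum, auto


class Intent(Enum):
    # Toy stand-in for AnalysisIntent; the real members appear in the hunk above.
    ANALYZE_STATISTICAL = auto()
    DETECT_ANOMALIES = auto()
    GENERATE_SUMMARY = auto()  # registration removed from the block above (may be handled elsewhere)


templates: dict[Intent, str] = {
    Intent.ANALYZE_STATISTICAL: "statistical pipeline",
    Intent.DETECT_ANOMALIES: "anomaly pipeline",
}

# dict.get mirrors the removed get_template: unregistered intents resolve to None.
assert templates.get(Intent.GENERATE_SUMMARY) is None
```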
@@ -441,6 +327,190 @@ class PipelineTemplateRegistry:
             nodes=nodes,
         )

+    def _create_analyze_statistical_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="statistics",
+                name="통계 분석",
+                module="statistical_analyzer",
+                depends_on=["load_data"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.ANALYZE_STATISTICAL,
+            nodes=nodes,
+        )
+
+    def _create_analyze_nlp_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="nlp_analysis",
+                name="NLP 분석",
+                module="nlp_analyzer",
+                depends_on=["load_data"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.ANALYZE_NLP,
+            nodes=nodes,
+        )
+
+    def _create_analyze_causal_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="statistics",
+                name="통계 분석",
+                module="statistical_analyzer",
+                depends_on=["load_data"],
+            ),
+            AnalysisNode(
+                id="causal_analysis",
+                name="인과 분석",
+                module="causal_analyzer",
+                depends_on=["load_data", "statistics"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.ANALYZE_CAUSAL,
+            nodes=nodes,
+        )
+
+    def _create_analyze_network_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="statistics",
+                name="통계 분석",
+                module="statistical_analyzer",
+                depends_on=["load_data"],
+            ),
+            AnalysisNode(
+                id="network_analysis",
+                name="네트워크 분석",
+                module="network_analyzer",
+                depends_on=["statistics"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.ANALYZE_NETWORK,
+            nodes=nodes,
+        )
+
+    def _create_analyze_playbook_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="diagnostic",
+                name="진단 분석",
+                module="diagnostic_playbook",
+                depends_on=["load_data"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.ANALYZE_PLAYBOOK,
+            nodes=nodes,
+        )
+
+    def _create_detect_anomalies_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_runs",
+                name="실행 기록 로드",
+                module="run_loader",
+            ),
+            AnalysisNode(
+                id="anomaly_detection",
+                name="이상 탐지",
+                module="timeseries_advanced",
+                params={"mode": "anomaly"},
+                depends_on=["load_runs"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.DETECT_ANOMALIES,
+            nodes=nodes,
+        )
+
+    def _create_forecast_performance_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_runs",
+                name="실행 기록 로드",
+                module="run_loader",
+            ),
+            AnalysisNode(
+                id="forecast",
+                name="성능 예측",
+                module="timeseries_advanced",
+                params={"mode": "forecast"},
+                depends_on=["load_runs"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.FORECAST_PERFORMANCE,
+            nodes=nodes,
+        )
+
+    def _create_generate_hypotheses_template(self) -> AnalysisPipeline:
+        nodes = [
+            AnalysisNode(
+                id="load_data",
+                name="데이터 로드",
+                module="data_loader",
+            ),
+            AnalysisNode(
+                id="statistics",
+                name="통계 분석",
+                module="statistical_analyzer",
+                depends_on=["load_data"],
+            ),
+            AnalysisNode(
+                id="ragas_eval",
+                name="RAGAS 평가",
+                module="ragas_evaluator",
+                depends_on=["load_data"],
+            ),
+            AnalysisNode(
+                id="low_samples",
+                name="낮은 성능 케이스 추출",
+                module="low_performer_extractor",
+                depends_on=["ragas_eval"],
+            ),
+            AnalysisNode(
+                id="hypothesis",
+                name="가설 생성",
+                module="hypothesis_generator",
+                depends_on=["statistics", "low_samples"],
+            ),
+        ]
+        return AnalysisPipeline(
+            intent=AnalysisIntent.GENERATE_HYPOTHESES,
+            nodes=nodes,
+        )
+
     def _create_benchmark_retrieval_template(self) -> AnalysisPipeline:
         """검색 벤치마크 템플릿."""
         nodes = [
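Each added template is a small DAG: every AnalysisNode names a module and lists its prerequisite node ids in depends_on. A dependency-respecting execution order can be derived with the standard library alone; a minimal sketch using the GENERATE_HYPOTHESES node ids copied from the hunk above (evalvault's actual executor is not shown in this diff and may work differently):

```python
# Derive an execution order for the GENERATE_HYPOTHESES template's depends_on graph.
from graphlib import TopologicalSorter

depends_on = {
    "load_data": [],
    "statistics": ["load_data"],
    "ragas_eval": ["load_data"],
    "low_samples": ["ragas_eval"],
    "hypothesis": ["statistics", "low_samples"],
}

# static_order() yields each node only after all of its predecessors.
order = list(TopologicalSorter(depends_on).static_order())
print(order)  # e.g. ['load_data', 'statistics', 'ragas_eval', 'low_samples', 'hypothesis']
```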
@@ -3,10 +3,16 @@
 from __future__ import annotations

 import time
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
+from datetime import date
 from typing import Any

-from evalvault.domain.entities import Dataset
+from evalvault.domain.entities import Dataset, TestCase
+from evalvault.domain.services.document_versioning import (
+    VersionedChunk,
+    parse_contract_date,
+    select_chunks_for_contract_date,
+)
 from evalvault.ports.outbound.korean_nlp_port import RetrieverPort, RetrieverResultProtocol


@@ -156,3 +162,51 @@ def _compact_values(values: set[str]) -> str | list[str]:
     if len(values) == 1:
         return next(iter(values))
     return sorted(values)
+
+
+def apply_versioned_retriever_to_dataset(
+    *,
+    dataset: Dataset,
+    versioned_chunks: Sequence[VersionedChunk],
+    build_retriever: Callable[[Sequence[str]], RetrieverPort],
+    top_k: int,
+) -> dict[str, dict[str, Any]]:
+    cases_by_contract: dict[date | None, list[TestCase]] = {}
+    for test_case in dataset.test_cases:
+        if _has_contexts(test_case.contexts):
+            continue
+        contract = None
+        if isinstance(test_case.metadata, dict):
+            contract = parse_contract_date(test_case.metadata.get("contract_date"))
+        cases_by_contract.setdefault(contract, []).append(test_case)
+
+    if not cases_by_contract:
+        return {}
+
+    retrieval_metadata: dict[str, dict[str, Any]] = {}
+    chunk_list = list(versioned_chunks)
+
+    for contract, cases in cases_by_contract.items():
+        selected = select_chunks_for_contract_date(chunk_list, contract)
+        documents = [chunk.content for chunk in selected]
+        doc_ids = [chunk.doc_id for chunk in selected]
+
+        retriever = build_retriever(documents)
+        subset = Dataset(
+            name=dataset.name,
+            version=dataset.version,
+            test_cases=cases,
+            metadata=dict(dataset.metadata or {}),
+            source_file=dataset.source_file,
+            thresholds=dict(dataset.thresholds or {}),
+        )
+        retrieval_metadata.update(
+            apply_retriever_to_dataset(
+                dataset=subset,
+                retriever=retriever,
+                top_k=top_k,
+                doc_ids=doc_ids,
+            )
+        )
+
+    return retrieval_metadata
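The new apply_versioned_retriever_to_dataset helper groups test cases that have no contexts by the contract_date found in their metadata, selects the chunks appropriate for that date via select_chunks_for_contract_date, builds one retriever per group, and merges the per-case results from apply_retriever_to_dataset. The date-based selection itself lives in the new document_versioning service and is not shown in this diff; the sketch below illustrates one plausible reading of it (a validity window per chunk) using stand-in dataclasses, not evalvault's real VersionedChunk or TestCase:

```python
# Self-contained sketch of the selection idea: answer each question only against
# the document chunks that were in force on that case's contract date.
from dataclasses import dataclass
from datetime import date


@dataclass
class Chunk:  # illustrative stand-in for VersionedChunk
    doc_id: str
    content: str
    valid_from: date
    valid_to: date | None  # None means still in force


@dataclass
class Case:  # illustrative stand-in for TestCase metadata
    question: str
    contract_date: date | None


def chunks_for(chunks: list[Chunk], contract: date | None) -> list[Chunk]:
    """Pick the chunks whose validity window covers the contract date."""
    if contract is None:
        return list(chunks)  # no date on the case: fall back to every chunk
    return [
        c
        for c in chunks
        if c.valid_from <= contract and (c.valid_to is None or contract <= c.valid_to)
    ]


chunks = [
    Chunk("policy-v1", "2023 terms ...", date(2023, 1, 1), date(2023, 12, 31)),
    Chunk("policy-v2", "2024 terms ...", date(2024, 1, 1), None),
]
cases = [Case("What is the waiting period?", date(2023, 6, 1))]

for case in cases:
    selected = chunks_for(chunks, case.contract_date)
    print(case.question, "->", [c.doc_id for c in selected])  # -> ['policy-v1']
```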
@@ -591,7 +591,7 @@ def _resolve_result_count_norm(stage_events: list[StageEvent]) -> float | None:
         if isinstance(doc_ids, list):
             counts.append(len(doc_ids))
             top_k = event.attributes.get("top_k")
-            if isinstance(top_k, (int, float)) and top_k > 0:
+            if isinstance(top_k, int | float) and top_k > 0:
                 top_ks.append(int(top_k))
             else:
                 top_ks.append(len(doc_ids))
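The isinstance change above relies on PEP 604: from Python 3.10 onward a union written with | can be passed to isinstance directly, so the new form is behaviorally identical to the old tuple form (and suggests this code path now assumes Python 3.10+).

```python
# Equivalent checks on Python 3.10+; the | form is what the diff switches to.
top_k = 5
assert isinstance(top_k, (int, float))
assert isinstance(top_k, int | float)
```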
@@ -1,6 +1,6 @@
 """분석 서비스 인터페이스."""

-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol

 from evalvault.domain.entities import EvaluationRun
 from evalvault.domain.entities.analysis import (
@@ -44,7 +44,7 @@ class AnalysisPort(Protocol):
         run_a: EvaluationRun,
         run_b: EvaluationRun,
         metrics: list[str] | None = None,
-        test_type: str = "t-test",
+        test_type: Literal["t-test", "mann-whitney"] = "t-test",
     ) -> list[ComparisonResult]:
         """두 실행을 통계적으로 비교합니다.

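Tightening test_type from str to Literal["t-test", "mann-whitney"] documents the only accepted values and lets static type checkers reject anything else at call sites; at runtime the argument is still an ordinary string. A toy illustration (not evalvault's AnalysisPort):

```python
from typing import Literal


def compare(test_type: Literal["t-test", "mann-whitney"] = "t-test") -> str:
    # Runtime behavior is unchanged: test_type is just a str.
    return test_type


compare("mann-whitney")  # accepted by type checkers
compare("anova")         # still runs, but mypy/pyright report an invalid Literal value
```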
@@ -28,6 +28,7 @@ class PatternDefinitionProtocol(Protocol):
     pattern_type: str
     component: str
     priority: str
+    detection_rules: Sequence[Any]
     actions: Sequence[ActionDefinitionProtocol]


@@ -55,6 +56,7 @@ class PatternDetectorPort(Protocol):
         metrics: Sequence[str] | None = None,
     ) -> Mapping[str, list[PatternEvidence]]:
         """Detect problematic patterns for the evaluation run."""
+        ...


 @runtime_checkable
@@ -71,6 +73,7 @@ class InsightGeneratorPort(Protocol):

     def enrich_failure_sample(self, failure: FailureSample) -> FailureSample:
         """Enrich a single failure sample using LLM analysis."""
+        ...

     def analyze_batch_failures(
         self,
@@ -80,3 +83,4 @@ class InsightGeneratorPort(Protocol):
         threshold: float,
     ) -> ClaimImprovementProtocol:
         """Produce aggregated insights for multiple failures."""
+        ...
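The trailing ... added to these protocol methods is the conventional placeholder body for interface-only declarations; it does not change how @runtime_checkable isinstance checks behave, since those only verify that the member names exist. A toy sketch of a structural implementation satisfying such a protocol (not evalvault's actual ports):

```python
from typing import Protocol, runtime_checkable


@runtime_checkable
class DetectorPort(Protocol):  # illustrative protocol, not evalvault's PatternDetectorPort
    def detect(self, run_id: str) -> list[str]:
        """Detect problematic patterns."""
        ...  # placeholder body, same convention as the hunks above


class KeywordDetector:  # no inheritance needed; structural typing applies
    def detect(self, run_id: str) -> list[str]:
        return [f"pattern-in-{run_id}"]


assert isinstance(KeywordDetector(), DetectorPort)  # runtime check only sees member names
```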