evalvault-1.68.1-py3-none-any.whl → evalvault-1.69.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
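At a glance, 1.69.0 makes two functional changes, both visible in the hunks below: the retriever layer now captures per-case GraphRAG detail (per-doc graph and source attribution), which the retrieval analyzer aggregates and samples, and a new multi-turn analysis module is registered and wired into the pipeline templates. Judging by the class/function names in the `@@` headers and the RECORD changes at the end, the first hunk touches `pipeline_factory.py`, the next five `retrieval_analyzer_module.py`, the five after that `pipeline_template_registry.py`, and the two after that `retriever_context.py`; the remaining hunks are METADATA/RECORD bookkeeping.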
@@ -58,6 +58,7 @@ def build_analysis_pipeline_service(
     service.register_module(analysis_modules.DiagnosticPlaybookModule())
     service.register_module(analysis_modules.RootCauseAnalyzerModule())
     service.register_module(analysis_modules.PatternDetectorModule())
+    service.register_module(analysis_modules.MultiTurnAnalyzerModule())
     service.register_module(analysis_modules.TimeSeriesAnalyzerModule())
     service.register_module(analysis_modules.TimeSeriesAdvancedModule())
     service.register_module(analysis_modules.TrendDetectorModule())
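This hunk registers the new `MultiTurnAnalyzerModule` alongside the existing analyzers in the pipeline factory; judging by the template hunks further down, it is resolved under the module id `multiturn_analyzer`.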
@@ -37,6 +37,8 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):

         params = params or {}
         max_cases = int(params.get("max_cases", 150))
+        max_examples = int(params.get("max_examples", 5))
+        max_graphrag_docs = int(params.get("max_graphrag_docs", 5))

         context_counts: list[int] = []
         context_token_counts: list[int] = []
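Two new knobs cap how much GraphRAG detail `RetrievalAnalyzerModule` carries into its output: `max_examples` bounds how many per-case example payloads are collected, and `max_graphrag_docs` bounds the docs kept inside each example via `_trim_graphrag_example` (defined further down). Both default to 5, so a params payload such as `{"max_examples": 3, "max_graphrag_docs": 2}` would tighten them.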
@@ -96,6 +98,11 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
         retrieval_meta = run.retrieval_metadata or {}
         retrieval_times: list[float] = []
         retrieval_scores: list[float] = []
+        graph_nodes: list[int] = []
+        graph_edges: list[int] = []
+        subgraph_sizes: list[int] = []
+        graphrag_cases = 0
+        graphrag_examples: list[dict[str, Any]] = []
         for item in retrieval_meta.values():
             if isinstance(item, dict):
                 if "retrieval_time_ms" in item:
@@ -103,6 +110,20 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
                 scores = item.get("scores")
                 if isinstance(scores, list) and scores:
                     retrieval_scores.append(safe_mean([float(s) for s in scores]))
+                if "graph_nodes" in item:
+                    graph_nodes.append(int(item["graph_nodes"]))
+                if "graph_edges" in item:
+                    graph_edges.append(int(item["graph_edges"]))
+                if "subgraph_size" in item:
+                    subgraph_sizes.append(int(item["subgraph_size"]))
+                if item.get("retriever") == "graphrag":
+                    graphrag_cases += 1
+                    if len(graphrag_examples) < max_examples:
+                        graphrag_details = item.get("graphrag")
+                        if isinstance(graphrag_details, dict):
+                            graphrag_examples.append(
+                                _trim_graphrag_example(graphrag_details, max_docs=max_graphrag_docs)
+                            )

         summary = {
             "total_cases": total_cases,
@@ -122,6 +143,15 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
             summary["avg_retrieval_time_ms"] = round(safe_mean(retrieval_times), 2)
         if retrieval_scores:
             summary["avg_retrieval_score"] = round(safe_mean(retrieval_scores), 4)
+        if graph_nodes:
+            summary["avg_graph_nodes"] = round(safe_mean(graph_nodes), 2)
+        if graph_edges:
+            summary["avg_graph_edges"] = round(safe_mean(graph_edges), 2)
+        if subgraph_sizes:
+            summary["avg_subgraph_size"] = round(safe_mean(subgraph_sizes), 2)
+        if total_cases:
+            summary["graphrag_case_rate"] = round(graphrag_cases / total_cases, 4)
+            summary["graphrag_case_count"] = graphrag_cases

         insights = []
         if summary["avg_contexts"] < 1:
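Each `avg_graph_*` aggregate is emitted only when at least one case reported the underlying field, so non-graph runs keep their summaries unchanged. `graphrag_case_rate` is the share of all cases routed through GraphRAG: for example, 3 graphrag cases out of 8 total yields `round(3 / 8, 4) == 0.375`.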
@@ -138,6 +168,38 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
                 "context_token_counts": context_token_counts[:100],
                 "keyword_overlap_scores": keyword_overlap_scores[:100],
                 "faithfulness_scores": faithfulness_scores[:100],
+                "graph_nodes": graph_nodes[:100],
+                "graph_edges": graph_edges[:100],
+                "subgraph_sizes": subgraph_sizes[:100],
+                "graphrag_examples": graphrag_examples,
             },
             "insights": insights,
         }
+
+
+def _trim_graphrag_example(payload: dict[str, Any], *, max_docs: int) -> dict[str, Any]:
+    docs = payload.get("docs")
+    if isinstance(docs, list):
+        trimmed_docs = []
+        for entry in docs[:max_docs]:
+            if not isinstance(entry, dict):
+                continue
+            trimmed_docs.append(_trim_graphrag_doc(entry))
+        docs = trimmed_docs
+    else:
+        docs = []
+    return {
+        "docs": docs,
+        "max_docs": max_docs,
+    }
+
+
+def _trim_graphrag_doc(entry: dict[str, Any]) -> dict[str, Any]:
+    output: dict[str, Any] = {}
+    for key in ("doc_id", "rank", "score"):
+        if key in entry:
+            output[key] = entry[key]
+    sources = entry.get("sources")
+    if isinstance(sources, dict):
+        output["sources"] = sources
+    return output
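A minimal usage sketch of the new trimming helpers. The import path is inferred from the RECORD entry for `retrieval_analyzer_module.py` below; the functions are private, so treat this as illustration rather than supported API, and the payload is invented.

```python
from evalvault.adapters.outbound.analysis.retrieval_analyzer_module import (
    _trim_graphrag_example,
)

payload = {
    "docs": [
        {"doc_id": "doc-1", "rank": 1, "score": 0.91,
         "sources": {"kg": {"entity_score": 0.7}},
         "text": "full document body, dropped by the trim"},
        {"doc_id": "doc-2", "rank": 2, "score": 0.83},
        "not-a-dict",  # beyond max_docs here; non-dict entries are skipped anyway
    ]
}

trimmed = _trim_graphrag_example(payload, max_docs=2)
# Only doc_id/rank/score and a dict-valued "sources" survive per doc:
# {"docs": [{"doc_id": "doc-1", "rank": 1, "score": 0.91,
#            "sources": {"kg": {"entity_score": 0.7}}},
#           {"doc_id": "doc-2", "rank": 2, "score": 0.83}],
#  "max_docs": 2}
```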
@@ -658,6 +658,12 @@ class PipelineTemplateRegistry:
                 module="statistical_analyzer",
                 depends_on=["load_data"],
             ),
+            AnalysisNode(
+                id="retrieval_analysis",
+                name="검색 분석",
+                module="retrieval_analyzer",
+                depends_on=["load_data"],
+            ),
             AnalysisNode(
                 id="priority_summary",
                 name="우선순위 요약",
@@ -669,7 +675,7 @@ class PipelineTemplateRegistry:
                 name="LLM 요약 보고서",
                 module="llm_report",
                 params={"report_type": "summary"},
-                depends_on=["load_data", "statistics"],
+                depends_on=["load_data", "statistics", "retrieval_analysis"],
             ),
         ]
         return AnalysisPipeline(
@@ -698,6 +704,12 @@ class PipelineTemplateRegistry:
                 module="ragas_evaluator",
                 depends_on=["load_data"],
             ),
+            AnalysisNode(
+                id="retrieval_analysis",
+                name="검색 분석",
+                module="retrieval_analyzer",
+                depends_on=["load_data"],
+            ),
             AnalysisNode(
                 id="low_samples",
                 name="낮은 성능 케이스 추출",
@@ -710,6 +722,12 @@ class PipelineTemplateRegistry:
                 module="diagnostic_playbook",
                 depends_on=["load_data", "ragas_eval"],
             ),
+            AnalysisNode(
+                id="multiturn",
+                name="멀티턴 분석",
+                module="multiturn_analyzer",
+                depends_on=["load_data", "ragas_eval"],
+            ),
             AnalysisNode(
                 id="nlp_analysis",
                 name="NLP 분석",
@@ -767,11 +785,13 @@ class PipelineTemplateRegistry:
                 "load_data",
                 "statistics",
                 "ragas_eval",
+                "retrieval_analysis",
                 "nlp_analysis",
                 "pattern_detection",
                 "causal_analysis",
                 "root_cause",
                 "priority_summary",
+                "multiturn",
                 "trend_detection",
             ],
         ),
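Taken together, the `PipelineTemplateRegistry` changes: both templates gain a `retrieval_analysis` node (module `retrieval_analyzer`, labelled "검색 분석", i.e. "retrieval analysis"); the second template also gains a `multiturn` node (module `multiturn_analyzer`, "멀티턴 분석", "multi-turn analysis") depending on `load_data` and `ragas_eval`; the `llm_report` node ("LLM 요약 보고서", "LLM summary report") now also waits on `retrieval_analysis`; and the node-id list above picks up both new ids.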
@@ -48,6 +48,14 @@ def apply_retriever_to_dataset(
         if scores:
            metadata["scores"] = scores
         metadata.update(_extract_graph_attributes(results))
+        graphrag_details = _build_graphrag_details(
+            results,
+            doc_ids=resolved_doc_ids,
+            max_docs=top_k,
+        )
+        if graphrag_details:
+            metadata["retriever"] = "graphrag"
+            metadata["graphrag"] = graphrag_details
         retrieval_metadata[test_case.id] = metadata

     return retrieval_metadata
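A hedged sketch of what one `retrieval_metadata` entry may look like after this change, when `_build_graphrag_details` (defined in the next hunk) finds graph-aware results. Field names are taken from the diff; the values and the flat `graph_*` keys written by `_extract_graph_attributes` are assumptions based on what the analyzer hunks above read.

```python
# Hypothetical entry produced for one test case; illustrative values only.
retrieval_metadata = {
    "case-001": {
        "scores": [0.91, 0.83],
        "graph_nodes": 42,        # assumed output of _extract_graph_attributes
        "graph_edges": 87,
        "subgraph_size": 9,
        "retriever": "graphrag",  # set only when graphrag details were built
        "graphrag": {
            "docs": [
                {
                    "doc_id": "doc-1",
                    "rank": 1,
                    "score": 0.91,
                    "sources": {
                        "kg": {
                            "entity_score": 0.7,
                            "relation_score": 0.4,
                            "entities": ["Policy A"],
                            "relations": ["covers"],
                            "community_id": "c7",
                        },
                        "bm25": {"rank": 2, "score": 11.3},
                    },
                },
            ],
            "max_docs": 5,
            "max_entities": 20,
            "max_relations": 20,
        },
    },
}
```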
@@ -164,6 +172,114 @@ def _compact_values(values: set[str]) -> str | list[str]:
     return sorted(values)


+def _build_graphrag_details(
+    results: Sequence[RetrieverResultProtocol],
+    *,
+    doc_ids: Sequence[str],
+    max_docs: int,
+    max_entities: int = 20,
+    max_relations: int = 20,
+) -> dict[str, Any] | None:
+    details: list[dict[str, Any]] = []
+    for rank, result in enumerate(results, start=1):
+        metadata = getattr(result, "metadata", None)
+        if not isinstance(metadata, dict):
+            continue
+
+        kg_meta = metadata.get("kg") if isinstance(metadata.get("kg"), dict) else None
+        bm25_meta = metadata.get("bm25") if isinstance(metadata.get("bm25"), dict) else None
+        dense_meta = metadata.get("dense") if isinstance(metadata.get("dense"), dict) else None
+        community_id = metadata.get("community_id")
+
+        if not (kg_meta or bm25_meta or dense_meta or community_id is not None):
+            continue
+
+        doc_id = _resolve_doc_id(result, doc_ids, rank)
+        entry: dict[str, Any] = {
+            "doc_id": doc_id,
+            "rank": rank,
+        }
+        score = _extract_score(result)
+        if score is not None:
+            entry["score"] = score
+
+        sources: dict[str, Any] = {}
+        if kg_meta:
+            sources["kg"] = {
+                "entity_score": _coerce_float_or_none(kg_meta.get("entity_score")),
+                "relation_score": _coerce_float_or_none(kg_meta.get("relation_score")),
+                "entities": _limit_strings(kg_meta.get("entities"), max_entities),
+                "relations": _limit_strings(kg_meta.get("relations"), max_relations),
+                "community_id": _coerce_text_or_list(kg_meta.get("community_id")),
+            }
+        if bm25_meta:
+            sources["bm25"] = _build_rank_score(bm25_meta)
+        if dense_meta:
+            sources["dense"] = _build_rank_score(dense_meta)
+        if community_id is not None:
+            sources["community_id"] = _coerce_text_or_list(community_id)
+        if sources:
+            entry["sources"] = sources
+
+        details.append(entry)
+        if len(details) >= max_docs:
+            break
+
+    if not details:
+        return None
+
+    return {
+        "docs": details,
+        "max_docs": max_docs,
+        "max_entities": max_entities,
+        "max_relations": max_relations,
+    }
+
+
+def _build_rank_score(payload: dict[str, Any]) -> dict[str, Any]:
+    out: dict[str, Any] = {}
+    rank = _coerce_int_optional(payload.get("rank"))
+    if rank is not None:
+        out["rank"] = rank
+    score = _coerce_float_or_none(payload.get("score"))
+    if score is not None:
+        out["score"] = score
+    return out
+
+
+def _coerce_float_or_none(value: Any) -> float | None:
+    try:
+        if value is None:
+            return None
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _coerce_int_optional(value: Any) -> int | None:
+    try:
+        if value is None:
+            return None
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _coerce_text_or_list(value: Any) -> str | list[str] | None:
+    if value is None:
+        return None
+    if isinstance(value, (list, tuple, set)):
+        return [str(item) for item in value]
+    return str(value)
+
+
+def _limit_strings(value: Any, limit: int) -> list[str]:
+    if not value:
+        return []
+    items = list(value) if isinstance(value, (list, tuple, set)) else [value]
+    return [str(item) for item in items[:limit]]
+
+
 def apply_versioned_retriever_to_dataset(
     *,
     dataset: Dataset,
1
1
  Metadata-Version: 2.4
2
2
  Name: evalvault
3
- Version: 1.68.1
3
+ Version: 1.69.0
4
4
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
5
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
6
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -90,11 +90,11 @@ evalvault/adapters/outbound/analysis/network_analyzer_module.py,sha256=ITUVnt_CI
90
90
  evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=aLtF_fns-7IEtitwON2EYS_lweq_IdldFsRm47alN0Q,29561
91
91
  evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=kVuG9pVMQO6OYY5zxj_w9nNQZ1-qIO0y6XcXo6lG-n0,8221
92
92
  evalvault/adapters/outbound/analysis/pattern_detector_module.py,sha256=SyCDO_VS-r-tjGh8WrW-t1GCSC9ouxirdVk4NizFPXo,1882
93
- evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=XvcCbKCN_otv1pGUzk0oE76RV19yFga8r6RngBvgEFo,3691
93
+ evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=Yk-VPagdAZXbbD08pCSOleg-URuVAzJks4oGl61mKAs,3763
94
94
  evalvault/adapters/outbound/analysis/pipeline_helpers.py,sha256=8E8IrYI5JvRrpnjxe0DS7srbPzB0XAxxXhLLYgfwsgU,5756
95
95
  evalvault/adapters/outbound/analysis/priority_summary_module.py,sha256=o8Y0rfHjYYE9WNTwKtpJulwfvLA3MNMhYjdSg15Vacc,10802
96
96
  evalvault/adapters/outbound/analysis/ragas_evaluator_module.py,sha256=Cd-spGn56zMcqOdoTLUHTYVOFqHqR17tPFyJs7rmnbw,7659
97
- evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py,sha256=D24GTaKabHacSBI-UqCd_jy61hnne8-QG1p4rqW1Bzk,5748
97
+ evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py,sha256=STRHWapVAEz0YbSxR3NzT6zV7wfwlPxjKZunuWpfTmE,8340
98
98
  evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py,sha256=_duIBlYhAsFygEpC7DuwoAqfTbVG2xgp70JjW1LJAGE,9312
99
99
  evalvault/adapters/outbound/analysis/retrieval_quality_checker_module.py,sha256=K1IJn4bvvz-BfqQmhd5Ik9oATjq_-G7V1AZSW8zKtSE,3121
100
100
  evalvault/adapters/outbound/analysis/root_cause_analyzer_module.py,sha256=UagHWb2d1vD7aCH0vLl3tSJx86gkkxNarrF-rwtEBhU,2811
@@ -280,7 +280,7 @@ evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SME
 evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
 evalvault/domain/services/ops_snapshot_service.py,sha256=1CqJN2p3tM6SgzLCZKcVEM213fd1cDGexTRPG_3e59w,5138
 evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
-evalvault/domain/services/pipeline_template_registry.py,sha256=aWqXLQ24grpSZo9M4tZLRo1ysD10c6hUpW3JupZH9e0,28083
+evalvault/domain/services/pipeline_template_registry.py,sha256=k5Ce1BC3NgcYqCLiUZpXsl_6WwDHOXONoYDH7KzX2L4,28809
 evalvault/domain/services/prompt_candidate_service.py,sha256=Ibyb5EaWK28Ju2HnTqHHGOoiA9Q-VwY3hjxVODALwGY,3997
 evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
 evalvault/domain/services/prompt_registry.py,sha256=QyL4yIcKT93uv6L0-Q_iaNXno8QnsC19YcGekuSRMtE,5247
@@ -290,7 +290,7 @@ evalvault/domain/services/prompt_suggestion_reporter.py,sha256=Fc6sCPebUMk8SZVpj
 evalvault/domain/services/ragas_prompt_overrides.py,sha256=4BecYE2KrreUBbIM3ssP9WzHcK_wRc8jW7CE_k58QOU,1412
 evalvault/domain/services/regression_gate_service.py,sha256=qBMODgpizmEzqEL8_JX-FYSVyARiroMW7MFVzlz7gjc,6579
 evalvault/domain/services/retrieval_metrics.py,sha256=dtrQPLMrXSyWLcgF8EGcLNFwzwA59WDzEh41JRToHAY,2980
-evalvault/domain/services/retriever_context.py,sha256=ySQ-GuadiggS0LVAib4AxA_0JpasYz4S9hbjau0eyIA,6482
+evalvault/domain/services/retriever_context.py,sha256=TeJ9UgT4l3lXxOXcYMz_7PdVMlV7JsW2ewTXdv9dI2M,10185
 evalvault/domain/services/run_comparison_service.py,sha256=_NScltCRcY3zrvdyYDiPmssTxCDv1GyjCLdP3uAxJts,5631
 evalvault/domain/services/satisfaction_calibration_service.py,sha256=H7Z8opOyPHRO5qVIw-XDsNhIwdCteAS9_a3BTlfIqHg,11906
 evalvault/domain/services/stage_event_builder.py,sha256=FAT34Wmylvd2Yz5rDlhaTh1lqSCDhGApCXMi7Hjkib0,9748
@@ -339,8 +339,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.68.1.dist-info/METADATA,sha256=BRG7UFXRx1fT_JDFqSsdOuB_nk_LVnaNSNYzVyYWyyU,26159
-evalvault-1.68.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-evalvault-1.68.1.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
-evalvault-1.68.1.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
-evalvault-1.68.1.dist-info/RECORD,,
+evalvault-1.69.0.dist-info/METADATA,sha256=dmvwwIQIH_eIRT1ZUxhEsoQbTdoAyXvxiSRuU_zxysg,26159
+evalvault-1.69.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.69.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.69.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.69.0.dist-info/RECORD,,
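The RECORD hunks above confirm that exactly four source files changed in this release: `pipeline_factory.py` (3691 → 3763 bytes), `retrieval_analyzer_module.py` (5748 → 8340), `pipeline_template_registry.py` (28083 → 28809), and `retriever_context.py` (6482 → 10185); everything else is the version rename of the dist-info files.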