evalvault 1.62.1-py3-none-any.whl → 1.63.1-py3-none-any.whl

Files changed (48)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +88 -5
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
  10. evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
  11. evalvault/adapters/outbound/llm/__init__.py +5 -43
  12. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  13. evalvault/adapters/outbound/llm/factory.py +103 -0
  14. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  15. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  16. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  17. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  18. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  19. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  20. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  21. evalvault/adapters/outbound/storage/base_sql.py +528 -21
  22. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  23. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  24. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  25. evalvault/debug_ragas.py +7 -1
  26. evalvault/debug_ragas_real.py +5 -1
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/prompt_suggestion.py +50 -0
  29. evalvault/domain/services/__init__.py +6 -0
  30. evalvault/domain/services/evaluator.py +191 -103
  31. evalvault/domain/services/holdout_splitter.py +67 -0
  32. evalvault/domain/services/intent_classifier.py +73 -0
  33. evalvault/domain/services/pipeline_template_registry.py +3 -0
  34. evalvault/domain/services/prompt_candidate_service.py +117 -0
  35. evalvault/domain/services/prompt_registry.py +40 -2
  36. evalvault/domain/services/prompt_scoring_service.py +286 -0
  37. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  38. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  39. evalvault/ports/inbound/learning_hook_port.py +4 -1
  40. evalvault/ports/outbound/__init__.py +2 -0
  41. evalvault/ports/outbound/llm_factory_port.py +13 -0
  42. evalvault/ports/outbound/llm_port.py +34 -2
  43. evalvault/ports/outbound/storage_port.py +38 -0
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
  47. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
  48. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/storage/postgres_adapter.py CHANGED
@@ -27,6 +27,7 @@ from evalvault.domain.entities.analysis import (
 )
 from evalvault.domain.entities.experiment import Experiment
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
+from evalvault.domain.entities.stage import StageEvent, StageMetric
 
 
 class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
@@ -823,6 +824,56 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             )
             conn.commit()
 
+    def save_analysis_report(
+        self,
+        *,
+        report_id: str | None,
+        run_id: str | None,
+        experiment_id: str | None,
+        report_type: str,
+        format: str,
+        content: str | None,
+        metadata: dict[str, Any] | None = None,
+        created_at: str | None = None,
+    ) -> str:
+        report_id = report_id or str(uuid.uuid4())
+        if created_at is None:
+            created_at_value = datetime.now(UTC)
+        else:
+            created_at_value = (
+                datetime.fromisoformat(created_at) if isinstance(created_at, str) else created_at
+            )
+
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                INSERT INTO analysis_reports (
+                    report_id, run_id, experiment_id, report_type, format, content, metadata, created_at
+                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+                ON CONFLICT (report_id) DO UPDATE SET
+                    run_id = EXCLUDED.run_id,
+                    experiment_id = EXCLUDED.experiment_id,
+                    report_type = EXCLUDED.report_type,
+                    format = EXCLUDED.format,
+                    content = EXCLUDED.content,
+                    metadata = EXCLUDED.metadata,
+                    created_at = EXCLUDED.created_at
+                """,
+                (
+                    report_id,
+                    run_id,
+                    experiment_id,
+                    report_type,
+                    format,
+                    content,
+                    self._serialize_pipeline_json(metadata),
+                    created_at_value,
+                ),
+            )
+            conn.commit()
+
+        return report_id
+
     def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
         """List stored pipeline analysis results."""
         query = """
@@ -837,6 +888,164 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             rows = conn.execute(query, (limit,)).fetchall()
         return [self._deserialize_pipeline_result(row, include_payload=False) for row in rows]
 
+    def save_stage_events(self, events: list[StageEvent]) -> int:
+        if not events:
+            return 0
+        with self._get_connection() as conn:
+            conn.executemany(
+                """
+                INSERT INTO stage_events (
+                    run_id, stage_id, parent_stage_id, stage_type, stage_name,
+                    status, attempt, started_at, finished_at, duration_ms,
+                    input_ref, output_ref, attributes, metadata, trace_id, span_id
+                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                ON CONFLICT (run_id, stage_id) DO UPDATE SET
+                    parent_stage_id = EXCLUDED.parent_stage_id,
+                    stage_type = EXCLUDED.stage_type,
+                    stage_name = EXCLUDED.stage_name,
+                    status = EXCLUDED.status,
+                    attempt = EXCLUDED.attempt,
+                    started_at = EXCLUDED.started_at,
+                    finished_at = EXCLUDED.finished_at,
+                    duration_ms = EXCLUDED.duration_ms,
+                    input_ref = EXCLUDED.input_ref,
+                    output_ref = EXCLUDED.output_ref,
+                    attributes = EXCLUDED.attributes,
+                    metadata = EXCLUDED.metadata,
+                    trace_id = EXCLUDED.trace_id,
+                    span_id = EXCLUDED.span_id
+                """,
+                [self._serialize_stage_event(event) for event in events],
+            )
+            conn.commit()
+        return len(events)
+
+    def list_stage_events(
+        self,
+        run_id: str,
+        *,
+        stage_type: str | None = None,
+    ) -> list[StageEvent]:
+        query = (
+            "SELECT run_id, stage_id, parent_stage_id, stage_type, stage_name, status, attempt, "
+            "started_at, finished_at, duration_ms, input_ref, output_ref, attributes, metadata, "
+            "trace_id, span_id FROM stage_events WHERE run_id = %s"
+        )
+        params: list[Any] = [run_id]
+        if stage_type:
+            query += " AND stage_type = %s"
+            params.append(stage_type)
+        query += " ORDER BY id"
+        with self._get_connection() as conn:
+            rows = conn.execute(query, params).fetchall()
+        return [self._deserialize_stage_event(row) for row in rows]
+
+    def save_stage_metrics(self, metrics: list[StageMetric]) -> int:
+        if not metrics:
+            return 0
+        with self._get_connection() as conn:
+            conn.executemany(
+                """
+                INSERT INTO stage_metrics (
+                    run_id, stage_id, metric_name, score, threshold, evidence
+                ) VALUES (%s, %s, %s, %s, %s, %s)
+                """,
+                [self._serialize_stage_metric(metric) for metric in metrics],
+            )
+            conn.commit()
+        return len(metrics)
+
+    def list_stage_metrics(
+        self,
+        run_id: str,
+        *,
+        stage_id: str | None = None,
+        metric_name: str | None = None,
+    ) -> list[StageMetric]:
+        query = (
+            "SELECT run_id, stage_id, metric_name, score, threshold, evidence "
+            "FROM stage_metrics WHERE run_id = %s"
+        )
+        params: list[Any] = [run_id]
+        if stage_id:
+            query += " AND stage_id = %s"
+            params.append(stage_id)
+        if metric_name:
+            query += " AND metric_name = %s"
+            params.append(metric_name)
+        query += " ORDER BY id"
+        with self._get_connection() as conn:
+            rows = conn.execute(query, params).fetchall()
+        return [self._deserialize_stage_metric(row) for row in rows]
+
+    def _serialize_stage_event(self, event: StageEvent) -> tuple[Any, ...]:
+        return (
+            event.run_id,
+            event.stage_id,
+            event.parent_stage_id,
+            event.stage_type,
+            event.stage_name,
+            event.status,
+            event.attempt,
+            event.started_at,
+            event.finished_at,
+            event.duration_ms,
+            self._serialize_payload_ref(event.input_ref),
+            self._serialize_payload_ref(event.output_ref),
+            self._serialize_pipeline_json(event.attributes),
+            self._serialize_pipeline_json(event.metadata),
+            event.trace_id,
+            event.span_id,
+        )
+
+    def _serialize_stage_metric(self, metric: StageMetric) -> tuple[Any, ...]:
+        return (
+            metric.run_id,
+            metric.stage_id,
+            metric.metric_name,
+            metric.score,
+            metric.threshold,
+            self._serialize_pipeline_json(metric.evidence),
+        )
+
+    def _serialize_payload_ref(self, ref: Any) -> str | None:
+        if ref is None:
+            return None
+        payload = ref.to_dict() if hasattr(ref, "to_dict") else ref
+        return self._serialize_pipeline_json(payload)
+
+    def _deserialize_stage_event(self, row: dict[str, Any]) -> StageEvent:
+        payload = {
+            "run_id": row.get("run_id"),
+            "stage_id": row.get("stage_id"),
+            "parent_stage_id": row.get("parent_stage_id"),
+            "stage_type": row.get("stage_type"),
+            "stage_name": row.get("stage_name"),
+            "status": row.get("status"),
+            "attempt": row.get("attempt"),
+            "started_at": row.get("started_at"),
+            "finished_at": row.get("finished_at"),
+            "duration_ms": row.get("duration_ms"),
+            "input_ref": self._ensure_json(row.get("input_ref")),
+            "output_ref": self._ensure_json(row.get("output_ref")),
+            "attributes": self._ensure_json(row.get("attributes")) or {},
+            "metadata": self._ensure_json(row.get("metadata")) or {},
+            "trace_id": row.get("trace_id"),
+            "span_id": row.get("span_id"),
+        }
+        return StageEvent.from_dict(payload)
+
+    def _deserialize_stage_metric(self, row: dict[str, Any]) -> StageMetric:
+        payload = {
+            "run_id": row.get("run_id"),
+            "stage_id": row.get("stage_id"),
+            "metric_name": row.get("metric_name"),
+            "score": row.get("score"),
+            "threshold": row.get("threshold"),
+            "evidence": self._ensure_json(row.get("evidence")),
+        }
+        return StageMetric.from_dict(payload)
+
     def get_pipeline_result(self, result_id: str) -> dict[str, Any]:
         """Fetch a stored pipeline analysis result."""
         with self._get_connection() as conn:
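
Reviewer note: the stage APIs are batch-oriented; save_stage_events upserts on (run_id, stage_id) and returns the number of events written, while list_stage_events filters by run and optional stage type. A round-trip sketch, assuming the storage adapter from the previous sketch and that StageEvent.from_dict accepts the field names _deserialize_stage_event uses above; every value here is illustrative:

from evalvault.domain.entities.stage import StageEvent

# Illustrative event; field names mirror the serializer in this diff.
event = StageEvent.from_dict({
    "run_id": "run-123",
    "stage_id": "retrieve-01",
    "parent_stage_id": None,
    "stage_type": "retrieval",
    "stage_name": "vector search",
    "status": "completed",
    "attempt": 1,
    "started_at": None,
    "finished_at": None,
    "duration_ms": 42.0,
    "input_ref": None,
    "output_ref": None,
    "attributes": {"top_k": 5},
    "metadata": {},
    "trace_id": None,
    "span_id": None,
})

written = storage.save_stage_events([event])   # -> 1; re-saving the same stage_id updates in place
retrieval_events = storage.list_stage_events("run-123", stage_type="retrieval")
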
evalvault/adapters/outbound/storage/postgres_schema.sql CHANGED
@@ -206,3 +206,41 @@ CREATE INDEX IF NOT EXISTS idx_pipeline_results_intent
     ON pipeline_results(intent);
 CREATE INDEX IF NOT EXISTS idx_pipeline_results_run_id
     ON pipeline_results(run_id);
+
+CREATE TABLE IF NOT EXISTS stage_events (
+    id BIGSERIAL PRIMARY KEY,
+    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+    stage_id TEXT NOT NULL,
+    parent_stage_id TEXT,
+    stage_type TEXT NOT NULL,
+    stage_name TEXT,
+    status TEXT,
+    attempt INTEGER DEFAULT 1,
+    started_at TIMESTAMP WITH TIME ZONE,
+    finished_at TIMESTAMP WITH TIME ZONE,
+    duration_ms DOUBLE PRECISION,
+    input_ref JSONB,
+    output_ref JSONB,
+    attributes JSONB,
+    metadata JSONB,
+    trace_id TEXT,
+    span_id TEXT
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_stage_events_run_stage_id
+    ON stage_events(run_id, stage_id);
+CREATE INDEX IF NOT EXISTS idx_stage_events_run_id ON stage_events(run_id);
+CREATE INDEX IF NOT EXISTS idx_stage_events_stage_type ON stage_events(stage_type);
+
+CREATE TABLE IF NOT EXISTS stage_metrics (
+    id BIGSERIAL PRIMARY KEY,
+    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+    stage_id TEXT NOT NULL,
+    metric_name TEXT NOT NULL,
+    score DOUBLE PRECISION NOT NULL,
+    threshold DOUBLE PRECISION,
+    evidence JSONB
+);
+
+CREATE INDEX IF NOT EXISTS idx_stage_metrics_run_id ON stage_metrics(run_id);
+CREATE INDEX IF NOT EXISTS idx_stage_metrics_stage_id ON stage_metrics(stage_id);
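
Reviewer note: both tables cascade-delete with their evaluation_runs row, and the unique index on (run_id, stage_id) is what backs the adapter's ON CONFLICT upsert. A hedged psycopg (v3) sketch of querying the new tables directly, for example to list stages whose metric score fell below its threshold; the DSN and run ID are placeholders:

import psycopg  # psycopg 3

run_id = "00000000-0000-0000-0000-000000000000"  # run_id columns are UUID-typed

with psycopg.connect("dbname=evalvault") as conn:  # illustrative libpq DSN
    rows = conn.execute(
        """
        SELECT e.stage_name, m.metric_name, m.score, m.threshold
        FROM stage_metrics m
        JOIN stage_events e
          ON e.run_id = m.run_id AND e.stage_id = m.stage_id
        WHERE m.run_id = %s
          AND m.threshold IS NOT NULL
          AND m.score < m.threshold
        """,
        (run_id,),
    ).fetchall()
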
evalvault/adapters/outbound/storage/sqlite_adapter.py CHANGED
@@ -4,10 +4,12 @@ from __future__ import annotations
 
 import json
 import sqlite3
+import uuid
+from contextlib import AbstractContextManager, closing
 from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 from evalvault.adapters.outbound.storage.base_sql import BaseSQLStorageAdapter, SQLQueries
 from evalvault.domain.entities.analysis import (
@@ -61,15 +63,20 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         conn.commit()
         conn.close()
 
-    def _connect(self) -> sqlite3.Connection:
+    def _connect(self) -> Any:
         """Create a DB-API connection with the expected options."""
         conn = sqlite3.connect(self.db_path)
         conn.row_factory = sqlite3.Row
         conn.execute("PRAGMA foreign_keys = ON")
         return conn
 
-    def _apply_migrations(self, conn: sqlite3.Connection) -> None:
+    def _get_connection(self) -> AbstractContextManager[sqlite3.Connection]:
+        conn = self._connect()
+        return closing(cast(sqlite3.Connection, conn))
+
+    def _apply_migrations(self, conn: Any) -> None:
         """Apply schema migrations for legacy databases."""
+        conn = cast(Any, conn)
         cursor = conn.execute("PRAGMA table_info(evaluation_runs)")
         columns = {row[1] for row in cursor.fetchall()}
         if "metadata" not in columns:
@@ -180,6 +187,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     def save_prompt_set(self, bundle: PromptSetBundle) -> None:
         """Save prompt set, prompts, and join items."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             cursor.execute(
                 """
@@ -241,6 +249,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     def link_prompt_set_to_run(self, run_id: str, prompt_set_id: str) -> None:
         """Attach a prompt set to a run."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             conn.execute(
                 """
                 INSERT OR REPLACE INTO run_prompt_sets (
@@ -258,6 +267,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     def get_prompt_set(self, prompt_set_id: str) -> PromptSetBundle:
         """Load a prompt set bundle by ID."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(
                 """
                 SELECT prompt_set_id, name, description, metadata, created_at
@@ -270,12 +280,17 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             if not row:
                 raise KeyError(f"Prompt set not found: {prompt_set_id}")
 
+            created_at = self._deserialize_datetime(row["created_at"])
+            if created_at is None:
+                created_at = datetime.now()
+            assert created_at is not None
+
             prompt_set = PromptSet(
                 prompt_set_id=row["prompt_set_id"],
                 name=row["name"],
                 description=row["description"] or "",
                 metadata=json.loads(row["metadata"]) if row["metadata"] else {},
-                created_at=self._deserialize_datetime(row["created_at"]),
+                created_at=created_at,
             )
 
             item_rows = conn.execute(
@@ -313,6 +328,11 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
                 tuple(prompt_ids),
             ).fetchall()
             for prompt_row in prompt_rows:
+                created_at = self._deserialize_datetime(prompt_row["created_at"])
+                if created_at is None:
+                    created_at = datetime.now()
+                assert created_at is not None
+
                 prompts.append(
                     Prompt(
                         prompt_id=prompt_row["prompt_id"],
@@ -325,7 +345,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
                         metadata=json.loads(prompt_row["metadata"])
                         if prompt_row["metadata"]
                         else {},
-                        created_at=self._deserialize_datetime(prompt_row["created_at"]),
+                        created_at=created_at,
                     )
                 )
 
@@ -334,6 +354,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     def get_prompt_set_for_run(self, run_id: str) -> PromptSetBundle | None:
         """Load the prompt set linked to a run."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             row = conn.execute(
                 """
                 SELECT prompt_set_id
@@ -360,6 +381,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             ID of the saved experiment
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             # Insert or replace experiment
             cursor.execute(
@@ -425,6 +447,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             KeyError: if the experiment is not found
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             # Fetch experiment
             cursor.execute(
@@ -503,6 +526,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             List of Experiment objects
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             query = "SELECT experiment_id FROM experiments WHERE 1=1"
             params = []
@@ -539,6 +563,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             ID of the saved analysis
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             # Serialize analysis to JSON
             result_data = self._serialize_analysis(analysis)
@@ -574,6 +599,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             KeyError: if the analysis is not found
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             cursor.execute(
                 """
@@ -612,6 +638,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             List of StatisticalAnalysis objects
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             query = """
                 SELECT analysis_id, run_id, analysis_type, result_data, created_at
@@ -650,6 +677,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             Whether the deletion succeeded
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             cursor.execute(
                 "DELETE FROM analysis_results WHERE analysis_id = ?",
@@ -726,6 +754,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         import uuid
 
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             analysis_id = f"nlp-{analysis.run_id}-{uuid.uuid4().hex[:8]}"
             result_data = self._serialize_nlp_analysis(analysis)
@@ -761,6 +790,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             KeyError: if the analysis is not found
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             cursor.execute(
                 """
@@ -788,6 +818,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             NLPAnalysis or None (if no analysis result exists)
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.cursor()
             cursor.execute(
                 """
@@ -891,6 +922,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         is_complete = 1 if record.get("is_complete", False) else 0
 
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             conn.execute(
                 """
                 INSERT OR REPLACE INTO pipeline_results (
@@ -920,6 +952,44 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             )
             conn.commit()
 
+    def save_analysis_report(
+        self,
+        *,
+        report_id: str | None,
+        run_id: str | None,
+        experiment_id: str | None,
+        report_type: str,
+        format: str,
+        content: str | None,
+        metadata: dict[str, Any] | None = None,
+        created_at: str | None = None,
+    ) -> str:
+        report_id = report_id or str(uuid.uuid4())
+        created_at = created_at or datetime.now().isoformat()
+
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            conn.execute(
+                """
+                INSERT OR REPLACE INTO analysis_reports (
+                    report_id, run_id, experiment_id, report_type, format, content, metadata, created_at
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    report_id,
+                    run_id,
+                    experiment_id,
+                    report_type,
+                    format,
+                    content,
+                    self._serialize_json(metadata),
+                    created_at,
+                ),
+            )
+            conn.commit()
+
+        return report_id
+
     def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
         """List stored pipeline analysis results."""
         query = """
@@ -931,12 +1001,14 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             LIMIT ?
         """
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             rows = conn.execute(query, (limit,)).fetchall()
         return [self._deserialize_pipeline_result(row, include_payload=False) for row in rows]
 
     def get_pipeline_result(self, result_id: str) -> dict[str, Any]:
         """Fetch a stored pipeline analysis result."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             row = conn.execute(
                 """
                 SELECT result_id, intent, query, run_id, pipeline_id,
@@ -983,6 +1055,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     def save_stage_event(self, event: StageEvent) -> str:
         """Save a stage event."""
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             conn.execute(
                 """
                 INSERT OR REPLACE INTO stage_events (
@@ -1001,6 +1074,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         if not events:
             return 0
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             conn.executemany(
                 """
                 INSERT OR REPLACE INTO stage_events (
@@ -1034,6 +1108,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             params.append(stage_type)
         query += " ORDER BY id"
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(query, params)
             rows = cursor.fetchall()
         return [self._deserialize_stage_event(row) for row in rows]
@@ -1043,6 +1118,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         if not metrics:
             return 0
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             conn.executemany(
                 """
                 INSERT INTO stage_metrics (
@@ -1076,6 +1152,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             params.append(metric_name)
         query += " ORDER BY id"
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(query, params)
             rows = cursor.fetchall()
         return [self._deserialize_stage_metric(row) for row in rows]
@@ -1155,6 +1232,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
 
     def save_benchmark_run(self, run: BenchmarkRun) -> str:
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             task_scores_json = json.dumps(
                 [
                     {
@@ -1208,6 +1286,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         )
 
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(
                 """
                 SELECT run_id, benchmark_type, model_name, backend, tasks,
@@ -1288,6 +1367,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             params.append(limit)
 
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(query, params)
             run_ids = [row["run_id"] for row in cursor.fetchall()]
 
@@ -1295,6 +1375,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
 
     def delete_benchmark_run(self, run_id: str) -> bool:
         with self._get_connection() as conn:
+            conn = cast(Any, conn)
             cursor = conn.execute(
                 "DELETE FROM benchmark_runs WHERE run_id = ?",
                 (run_id,),
evalvault/debug_ragas.py CHANGED
@@ -5,6 +5,9 @@ from unittest.mock import MagicMock
 from ragas import SingleTurnSample
 from ragas.metrics import AnswerRelevancy, Faithfulness
 
+from evalvault.adapters.outbound.llm import SettingsLLMFactory
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+from evalvault.config.settings import Settings
 from evalvault.domain.services.evaluator import RagasEvaluator
 from evalvault.ports.outbound.llm_port import LLMPort
 
@@ -29,7 +32,10 @@ async def debug_ragas():
     # Actually, Ragas metrics execute validation on `score` or `ascore`.
     # Failing at LLM call (e.g. no auth) is different from failing at argument passing.
 
-    evaluator = RagasEvaluator()
+    settings = Settings()
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
 
     # Create sample similar to what we observed
     sample = SingleTurnSample(
evalvault/debug_ragas_real.py CHANGED
@@ -1,7 +1,9 @@
 import asyncio
 import logging
 
+from evalvault.adapters.outbound.llm import SettingsLLMFactory
 from evalvault.adapters.outbound.llm.openai_adapter import OpenAIAdapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities.dataset import Dataset, TestCase
 from evalvault.domain.services.evaluator import RagasEvaluator
@@ -25,7 +27,9 @@ async def debug_ragas_real():
     print(f"Using Model: {settings.openai_model}")
 
     llm = OpenAIAdapter(settings)
-    evaluator = RagasEvaluator()
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
 
     # Manual Dataset
     test_case = TestCase(
evalvault/domain/entities/__init__.py CHANGED
@@ -37,6 +37,12 @@ from evalvault.domain.entities.improvement import (
 from evalvault.domain.entities.kg import EntityModel, RelationModel
 from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
+from evalvault.domain.entities.prompt_suggestion import (
+    PromptCandidate,
+    PromptCandidateSampleScore,
+    PromptCandidateScore,
+    PromptSuggestionResult,
+)
 from evalvault.domain.entities.rag_trace import (
     GenerationData,
     RAGTraceData,
@@ -110,6 +116,10 @@ __all__ = [
     "PromptSet",
     "PromptSetBundle",
     "PromptSetItem",
+    "PromptCandidate",
+    "PromptCandidateSampleScore",
+    "PromptCandidateScore",
+    "PromptSuggestionResult",
     # RAG Trace
     "GenerationData",
     "RAGTraceData",