evalvault-1.75.0-py3-none-any.whl → evalvault-1.77.0-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- evalvault/adapters/inbound/api/adapter.py +123 -64
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +3 -3
- evalvault/adapters/inbound/cli/commands/run.py +153 -30
- evalvault/adapters/inbound/cli/commands/run_helpers.py +166 -62
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +100 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +178 -12
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/runtime_services.py +122 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +25 -24
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/tracker/phoenix_adapter.py CHANGED

@@ -52,6 +52,8 @@ class PhoenixAdapter(TrackerPort):
         self,
         endpoint: str = "http://localhost:6006/v1/traces",
         service_name: str = "evalvault",
+        project_name: str | None = None,
+        annotations_enabled: bool = True,
     ):
         """Initialize Phoenix adapter with OpenTelemetry.

@@ -61,11 +63,14 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
+        self._project_name = project_name
+        self._annotations_enabled = annotations_enabled
         self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
         self._active_spans: dict[str, Any] = {}
         self._tracer_any: Any | None = None
         self._initialized = False
+        self._annotations_client: Any | None = None

     def _ensure_initialized(self) -> None:
         """Lazy initialization of OpenTelemetry tracer."""
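Taken together, these two hunks let callers pin the adapter to a Phoenix project and opt out of annotations at construction time. A minimal usage sketch (the import path follows the file list above; the project name is an illustrative placeholder):

```python
# Sketch: constructing the adapter with the options added in this release.
from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter

adapter = PhoenixAdapter(
    endpoint="http://localhost:6006/v1/traces",
    service_name="evalvault",
    project_name="my-rag-eval",    # hypothetical value; groups traces by project
    annotations_enabled=True,      # set False to skip the Phoenix annotations client
)
```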
@@ -96,7 +101,10 @@ class PhoenixAdapter(TrackerPort):
             return

         # Create resource with service name
-
+        resource_attributes = {"service.name": self._service_name}
+        if self._project_name:
+            resource_attributes["project.name"] = self._project_name
+        resource = Resource.create(resource_attributes)

         # Create tracer provider
         self._tracer_provider = TracerProvider(resource=resource)
@@ -123,6 +131,64 @@ class PhoenixAdapter(TrackerPort):
                 "Failed to initialize Phoenix tracer. Check endpoint configuration and dependencies."
             ) from e

+    def _phoenix_base_url(self) -> str:
+        if "/v1/traces" in self._endpoint:
+            return self._endpoint.split("/v1/traces")[0]
+        return self._endpoint.rstrip("/")
+
+    def _get_annotations_client(self) -> Any | None:
+        if not self._annotations_enabled:
+            return None
+        if self._annotations_client is not None:
+            return self._annotations_client
+        try:
+            from phoenix.client import Client
+        except Exception:
+            return None
+        self._annotations_client = Client(base_url=self._phoenix_base_url())
+        return self._annotations_client
+
+    def _annotate_span(
+        self,
+        *,
+        span: Any,
+        name: str,
+        label: str,
+        score: float | None = None,
+        explanation: str | None = None,
+    ) -> None:
+        client = self._get_annotations_client()
+        if client is None or span is None:
+            return
+        try:
+            from opentelemetry.trace import format_span_id
+
+            span_id = format_span_id(span.get_span_context().span_id)
+            spans_client = getattr(client, "spans", None)
+            add_span_annotation = (
+                getattr(spans_client, "add_span_annotation", None) if spans_client else None
+            )
+            if callable(add_span_annotation):
+                add_span_annotation(
+                    annotation_name=name,
+                    annotator_kind="CODE",
+                    span_id=span_id,
+                    label=label,
+                    score=score,
+                    explanation=explanation,
+                )
+                return
+            client.annotations.add_span_annotation(
+                annotation_name=name,
+                annotator_kind="CODE",
+                span_id=span_id,
+                label=label,
+                score=score,
+                explanation=explanation,
+            )
+        except Exception:
+            return
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """Start a new trace.

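These helpers are deliberately fail-soft: a missing `phoenix.client` package or any annotation error silently disables annotations instead of breaking tracing. The base-URL rule can be shown standalone (a sketch that mirrors `_phoenix_base_url`, not the method itself):

```python
# Sketch of the endpoint -> REST base URL rule used by _phoenix_base_url.
def phoenix_base_url(endpoint: str) -> str:
    # The OTLP traces path is stripped so the annotations client
    # talks to the Phoenix server root instead.
    if "/v1/traces" in endpoint:
        return endpoint.split("/v1/traces")[0]
    return endpoint.rstrip("/")

assert phoenix_base_url("http://localhost:6006/v1/traces") == "http://localhost:6006"
assert phoenix_base_url("https://phoenix.example.com/") == "https://phoenix.example.com"
```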
@@ -287,6 +353,7 @@ class PhoenixAdapter(TrackerPort):

         # Calculate per-metric summary
         metric_summary = {}
+        total_count = len(run.results) if run.results else 0
         for metric_name in run.metrics_evaluated:
             passed_count = sum(
                 1
@@ -299,9 +366,9 @@ class PhoenixAdapter(TrackerPort):
                 "average_score": round(avg_score, 4) if avg_score else 0.0,
                 "threshold": threshold,
                 "passed": passed_count,
-                "failed":
-                "total":
-                "pass_rate": round(passed_count /
+                "failed": total_count - passed_count,
+                "total": total_count,
+                "pass_rate": round(passed_count / total_count, 4) if total_count else 0.0,
             }

         # Start root trace
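The rewritten block derives `failed` and `pass_rate` from a single `total_count`, with a guard for empty runs. Worked example: 8 of 10 passing results gives `failed=2` and `pass_rate=0.8`, while `total_count=0` yields `0.0` instead of a ZeroDivisionError:

```python
# Worked example of the new per-metric summary arithmetic.
passed_count, total_count = 8, 10
pass_rate = round(passed_count / total_count, 4) if total_count else 0.0
assert (total_count - passed_count, pass_rate) == (2, 0.8)

passed_count, total_count = 0, 0  # empty run: no division by zero
assert (round(passed_count / total_count, 4) if total_count else 0.0) == 0.0
```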
@@ -328,8 +395,17 @@ class PhoenixAdapter(TrackerPort):

         # Set evaluation-specific attributes
         span = self._active_spans[trace_id]
+        span.set_attribute("openinference.span.kind", "EVALUATOR")
         span.set_attribute("evaluation.metrics", json.dumps(run.metrics_evaluated))
         span.set_attribute("evaluation.thresholds", json.dumps(run.thresholds))
+        span.set_attribute("evaluation.status", "pass" if run.pass_rate >= 1.0 else "fail")
+        if run.tracker_metadata:
+            project_name = run.tracker_metadata.get("project_name")
+            if project_name:
+                span.set_attribute("project.name", project_name)
+            project_kind = run.tracker_metadata.get("evaluation_task") or "evaluation"
+            span.set_attribute("project.kind", project_kind)
+            span.set_attribute("project.status", "pass" if run.pass_rate >= 1.0 else "fail")

         # Log average scores for each metric
         for metric_name, summary in metric_summary.items():
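The `project.*` attributes are driven entirely by `run.tracker_metadata`, so callers control grouping by populating that dict. The keys below are exactly those read in the hunk; the values are illustrative:

```python
# Sketch: tracker_metadata keys consumed by the root-span hunk above.
tracker_metadata = {
    "project_name": "my-rag-eval",      # becomes span attribute "project.name"
    "evaluation_task": "faithfulness",  # becomes "project.kind"
}
# When "evaluation_task" is absent, "project.kind" falls back to "evaluation".
project_kind = tracker_metadata.get("evaluation_task") or "evaluation"
assert project_kind == "faithfulness"
```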
@@ -369,6 +445,8 @@ class PhoenixAdapter(TrackerPort):
             },
             "metrics": metric_summary,
             "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "prompt_metadata": (run.tracker_metadata or {}).get("phoenix", {}).get("prompts"),
+            "tracker_metadata": run.tracker_metadata,
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
@@ -420,6 +498,23 @@ class PhoenixAdapter(TrackerPort):
             f"test-case-{result.test_case_id}",
             context=context,
         ) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK if result.all_passed else StatusCode.ERROR))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "EVALUATOR")
+            span.set_attribute("evaluation.status", "pass" if result.all_passed else "fail")
+            self._annotate_span(
+                span=span,
+                name="evaluation_result",
+                label="pass" if result.all_passed else "fail",
+                score=1.0 if result.all_passed else 0.0,
+                explanation="All metrics passed"
+                if result.all_passed
+                else "One or more metrics failed",
+            )
             # Input data
             safe_question = sanitize_text(result.question, max_chars=MAX_LOG_CHARS) or ""
             safe_answer = sanitize_text(result.answer, max_chars=MAX_LOG_CHARS) or ""
@@ -439,6 +534,10 @@ class PhoenixAdapter(TrackerPort):
             # Metrics
             span.set_attribute("output.all_passed", result.all_passed)
             span.set_attribute("output.tokens_used", result.tokens_used)
+            if result.tokens_used:
+                span.set_attribute("llm.token_count.total", result.tokens_used)
+            if result.cost_usd is not None:
+                span.set_attribute("llm.cost.total", result.cost_usd)

             for metric in result.metrics:
                 span.set_attribute(f"metric.{metric.name}.score", metric.score)
@@ -486,6 +585,7 @@ class PhoenixAdapter(TrackerPort):
             )
             if result.latency_ms:
                 span.set_attribute("timing.latency_ms", result.latency_ms)
+                span.set_attribute("evaluation.latency_ms", result.latency_ms)

     def log_retrieval(
         self,
@@ -528,6 +628,13 @@ class PhoenixAdapter(TrackerPort):
         if tracer is None:
            raise RuntimeError("Phoenix tracer is not initialized")
         with tracer.start_span("retrieval", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "RETRIEVER")
             # Set retrieval attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
@@ -541,14 +648,24 @@ class PhoenixAdapter(TrackerPort):

             span.set_attribute("spec.version", "0.1")
             span.set_attribute("rag.module", "retrieve")
+            if data.retrieval_time_ms:
+                span.set_attribute("retrieval.latency_ms", data.retrieval_time_ms)

             documents_payload = _build_retrieval_payload(data.candidates)
             span.set_attribute("custom.retrieval.doc_count", len(documents_payload))
             if documents_payload:
                 span.set_attribute("retrieval.documents_json", serialize_json(documents_payload))
-
-
-
+                previews = [
+                    item.get("content_preview")
+                    for item in documents_payload
+                    if item.get("content_preview")
+                ]
+                if previews:
+                    span.set_attribute("output.value", previews)
+                else:
+                    doc_ids = _extract_doc_ids(documents_payload)
+                    if doc_ids:
+                        span.set_attribute("output.value", doc_ids)

             # Log each retrieved document as an event
             for i, doc in enumerate(data.candidates):
@@ -615,10 +732,31 @@ class PhoenixAdapter(TrackerPort):
         if tracer is None:
            raise RuntimeError("Phoenix tracer is not initialized")
         with tracer.start_span("generation", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "LLM")
             # Set generation attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)

+            if data.model:
+                span.set_attribute("llm.model_name", data.model)
+                provider = data.model.split("/")[0] if "/" in data.model else ""
+                if provider:
+                    span.set_attribute("llm.provider", provider)
+            if data.input_tokens:
+                span.set_attribute("llm.token_count.prompt", data.input_tokens)
+            if data.output_tokens:
+                span.set_attribute("llm.token_count.completion", data.output_tokens)
+            if data.total_tokens:
+                span.set_attribute("llm.token_count.total", data.total_tokens)
+            if data.cost_usd is not None:
+                span.set_attribute("llm.cost.total", data.cost_usd)
+
             # Set prompt/response (truncate if too long)
             prompt = sanitize_text(data.prompt, max_chars=MAX_LOG_CHARS) or ""
             response = sanitize_text(data.response, max_chars=MAX_LOG_CHARS) or ""
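The provider attribute relies on a `provider/model` naming convention: `"openai/gpt-4o"` yields `llm.provider="openai"`, while a bare model name sets no provider at all. In isolation:

```python
# Sketch of the provider inference feeding llm.provider above.
def infer_provider(model: str) -> str:
    return model.split("/")[0] if "/" in model else ""

assert infer_provider("openai/gpt-4o") == "openai"
assert infer_provider("gpt-4o") == ""  # no separator, so the attribute is skipped
```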
@@ -637,6 +775,13 @@ class PhoenixAdapter(TrackerPort):
             safe_template = sanitize_text(data.prompt_template, max_chars=MAX_LOG_CHARS)
             if safe_template:
                 span.set_attribute("generation.prompt_template", safe_template)
+                span.set_attribute("llm.prompt_template.template", safe_template)
+                span.set_attribute("llm.prompt_template.version", "v1")
+                prompt_vars = data.metadata.get("prompt_variables") if data.metadata else None
+                if prompt_vars:
+                    span.set_attribute(
+                        "llm.prompt_template.variables", json.dumps(prompt_vars, default=str)
+                    )

     def log_rag_trace(self, data: RAGTraceData) -> str:
         """Log a full RAG trace (retrieval + generation) to Phoenix."""
@@ -660,6 +805,8 @@ class PhoenixAdapter(TrackerPort):
         span = self._active_spans[trace_id]
         should_end = True

+        span.set_attribute("openinference.span.kind", "CHAIN")
+
         for key, value in data.to_span_attributes().items():
             span.set_attribute(key, value)

@@ -667,11 +814,23 @@ class PhoenixAdapter(TrackerPort):
             self.log_retrieval(trace_id, data.retrieval)
         if data.generation:
             self.log_generation(trace_id, data.generation)
+        output_preview = ""
         if data.final_answer:
-
-
-
-
+            output_preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.generation and data.generation.response:
+            output_preview = sanitize_text(data.generation.response, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.retrieval:
+            previews: list[str] = []
+            for doc in data.retrieval.candidates:
+                if not doc.content:
+                    continue
+                preview = sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+                if preview:
+                    previews.append(preview)
+            output_preview = "\n".join(previews[:3])
+        if output_preview:
+            span.set_attribute("rag.final_answer", output_preview)
+            span.set_attribute("output.value", output_preview)

         if safe_query:
             span.set_attribute("input.value", safe_query)
@@ -697,7 +856,14 @@ def _build_retrieval_payload(
     payload: list[dict[str, Any]] = []
     for index, doc in enumerate(documents, start=1):
         doc_id = doc.chunk_id or doc.source or doc.metadata.get("doc_id") or f"doc_{index}"
-
+        preview = ""
+        if doc.content:
+            preview = sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+        item: dict[str, Any] = {
+            "doc_id": doc_id,
+            "score": doc.score,
+            "content_preview": preview,
+        }
         if doc.source:
             item["source"] = doc.source
         if doc.rerank_score is not None:
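With this change every payload item carries `doc_id`, `score`, and a (possibly empty) `content_preview`, which is what the `output.value` fallback in `log_retrieval` consumes. A sketch of one item's shape, with illustrative values:

```python
# Sketch: one _build_retrieval_payload item after this change.
item = {
    "doc_id": "chunk-42",   # chunk_id, source, metadata doc_id, or doc_N fallback
    "score": 0.87,
    "content_preview": "First characters of the chunk...",  # capped at MAX_CONTEXT_CHARS
    # "source" and "rerank_score" are still added conditionally, as before.
}
```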
evalvault/config/instrumentation.py CHANGED

@@ -26,6 +26,7 @@ _tracer_provider: TracerProvider | None = None
 def setup_phoenix_instrumentation(
     endpoint: str = "http://localhost:6006/v1/traces",
     service_name: str = "evalvault",
+    project_name: str | None = None,
     enable_langchain: bool = True,
     enable_openai: bool = True,
     sample_rate: float = 1.0,
@@ -73,12 +74,13 @@ def setup_phoenix_instrumentation(
         return None

     # Create resource with service name
-
-
-
-
-
-
+    resource_attributes = {
+        "service.name": service_name,
+        "service.version": "0.1.0",
+    }
+    if project_name:
+        resource_attributes["project.name"] = project_name
+    resource = Resource.create(resource_attributes)

     # Clamp sample rate between 0 and 1
     ratio = max(0.0, min(sample_rate, 1.0))
evalvault/config/phoenix_support.py CHANGED

@@ -59,10 +59,15 @@ def ensure_phoenix_instrumentation(
     if api_token:
         headers = {"api-key": api_token}

+    project_name = getattr(settings, "phoenix_project_name", None)
+    if project_name is not None and not isinstance(project_name, str):
+        project_name = None
+
     try:
         setup_phoenix_instrumentation(
             endpoint=endpoint,
             service_name="evalvault",
+            project_name=project_name,
             sample_rate=sample_rate,
             headers=headers,
         )
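The `getattr` plus `isinstance` guard means a missing or non-string `phoenix_project_name` setting simply disables project forwarding instead of raising. The guard in isolation:

```python
# Sketch: the defensive settings lookup above, shown standalone.
class FakeSettings:  # hypothetical stand-in for the evalvault Settings object
    phoenix_project_name = 123  # deliberately the wrong type

project_name = getattr(FakeSettings, "phoenix_project_name", None)
if project_name is not None and not isinstance(project_name, str):
    project_name = None  # non-string values are dropped, not forwarded
assert project_name is None
```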
evalvault/config/runtime_services.py ADDED

```python
from __future__ import annotations

import logging
import socket
import subprocess
from dataclasses import dataclass
from shutil import which
from urllib.parse import urlparse

from evalvault.config.settings import Settings, is_production_profile

logger = logging.getLogger(__name__)

_PHOENIX_CONTAINER = "evalvault-phoenix"


@dataclass(frozen=True)
class Endpoint:
    host: str
    port: int


def _is_local_host(host: str | None) -> bool:
    if not host:
        return False
    return host in {"localhost", "127.0.0.1", "0.0.0.0"}


def _port_is_open(host: str, port: int) -> bool:
    try:
        with socket.create_connection((host, port), timeout=0.2):
            return True
    except OSError:
        return False


def _parse_http_endpoint(url: str | None, default_port: int) -> Endpoint | None:
    if not url or not isinstance(url, str):
        return None
    parsed = urlparse(url)
    if parsed.scheme and parsed.scheme not in {"http", "https"}:
        return None
    host = parsed.hostname or ""
    port = parsed.port or default_port
    if not host or port <= 0:
        return None
    return Endpoint(host=host, port=port)


def _start_mlflow(port: int) -> bool:
    if which("mlflow") is None:
        logger.warning("MLflow CLI not found. Install with: uv sync --extra mlflow")
        return False
    try:
        subprocess.Popen(
            ["mlflow", "server", "--host", "0.0.0.0", "--port", str(port)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        logger.info("Started MLflow server on port %s", port)
        return True
    except Exception as exc:  # pragma: no cover - safety net
        logger.warning("Failed to start MLflow server: %s", exc)
        return False


def _start_phoenix(port: int) -> bool:
    if which("docker") is None:
        logger.warning("Docker not found. Phoenix auto-start skipped.")
        return False
    try:
        start = subprocess.run(
            ["docker", "start", _PHOENIX_CONTAINER],
            check=False,
            capture_output=True,
            text=True,
        )
        if start.returncode != 0:
            subprocess.run(
                [
                    "docker",
                    "run",
                    "-d",
                    "-p",
                    f"{port}:6006",
                    "--name",
                    _PHOENIX_CONTAINER,
                    "arizephoenix/phoenix:latest",
                ],
                check=False,
                capture_output=True,
                text=True,
            )
        logger.info("Ensured Phoenix container is running on port %s", port)
        return True
    except Exception as exc:  # pragma: no cover - safety net
        logger.warning("Failed to start Phoenix container: %s", exc)
        return False


def ensure_local_observability(settings: Settings) -> None:
    if is_production_profile(settings.evalvault_profile):
        return

    phoenix_endpoint = _parse_http_endpoint(
        getattr(settings, "phoenix_endpoint", None) or "http://localhost:6006/v1/traces",
        6006,
    )
    if (
        phoenix_endpoint
        and _is_local_host(phoenix_endpoint.host)
        and not _port_is_open(phoenix_endpoint.host, phoenix_endpoint.port)
    ):
        _start_phoenix(phoenix_endpoint.port)

    mlflow_endpoint = _parse_http_endpoint(getattr(settings, "mlflow_tracking_uri", None), 5000)
    if (
        mlflow_endpoint
        and _is_local_host(mlflow_endpoint.host)
        and not _port_is_open(mlflow_endpoint.host, mlflow_endpoint.port)
    ):
        _start_mlflow(mlflow_endpoint.port)
```
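The new module is best-effort by design: it runs only outside production profiles, probes just localhost-style hosts with a 200 ms socket timeout, and logs warnings instead of raising when Docker or the MLflow CLI is absent. A usage sketch (settings construction simplified; real code loads it from the environment):

```python
# Sketch: opting into local auto-start of the observability stack.
from evalvault.config.runtime_services import ensure_local_observability
from evalvault.config.settings import Settings

settings = Settings()  # assumes a non-production profile and env-based config
ensure_local_observability(settings)
# Phoenix: reuses or creates the "evalvault-phoenix" container on the configured
#   port, but only if the endpoint host is local and the port is closed.
# MLflow: launches "mlflow server" the same way for mlflow_tracking_uri.
```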
evalvault/config/settings.py CHANGED
@@ -55,6 +55,33 @@ def _parse_cors_origins(cors_origins: str | None) -> list[str]:
     return [origin.strip() for origin in cors_origins.split(",") if origin.strip()]


+def resolve_tracker_providers(provider: str | None) -> list[str]:
+    if not provider:
+        return []
+    normalized = provider.strip().lower()
+    if normalized in {"none", "off", "disabled"}:
+        return ["none"]
+    aliases = {
+        "all": ["mlflow", "phoenix"],
+        "default": ["mlflow", "phoenix"],
+    }
+    if normalized in aliases:
+        return aliases[normalized]
+    separators = [",", "+", "/", "|"]
+    for sep in separators:
+        normalized = normalized.replace(sep, ",")
+    providers = [p.strip() for p in normalized.split(",") if p.strip()]
+    if not providers:
+        return []
+    if "none" in providers and len(providers) > 1:
+        raise ValueError("tracker_provider cannot combine 'none' with other providers")
+    deduped: list[str] = []
+    for entry in providers:
+        if entry not in deduped:
+            deduped.append(entry)
+    return deduped
+
+
 SECRET_REFERENCE_FIELDS = (
     "api_auth_tokens",
     "knowledge_read_tokens",
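The resolver lowercases its input, accepts `,`, `+`, `/`, and `|` as separators, de-duplicates while preserving order, and refuses to mix `none` with real providers. A few concrete mappings:

```python
# Sketch: resolve_tracker_providers behavior for common inputs.
from evalvault.config.settings import resolve_tracker_providers

assert resolve_tracker_providers("mlflow+phoenix") == ["mlflow", "phoenix"]
assert resolve_tracker_providers("ALL") == ["mlflow", "phoenix"]    # alias
assert resolve_tracker_providers("phoenix|phoenix") == ["phoenix"]  # de-duplicated
assert resolve_tracker_providers("off") == ["none"]
try:
    resolve_tracker_providers("none,mlflow")
except ValueError:
    pass  # 'none' cannot be combined with other providers
```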
@@ -83,13 +110,14 @@ def _validate_production_settings(settings: "Settings") -> None:
     if settings.llm_provider == "openai" and not settings.openai_api_key:
         missing.append("OPENAI_API_KEY")

-
+    providers = resolve_tracker_providers(settings.tracker_provider)
+    if "langfuse" in providers:
         if not settings.langfuse_public_key:
             missing.append("LANGFUSE_PUBLIC_KEY")
         if not settings.langfuse_secret_key:
             missing.append("LANGFUSE_SECRET_KEY")

-    if
+    if "mlflow" in providers and not settings.mlflow_tracking_uri:
         missing.append("MLFLOW_TRACKING_URI")

     if (
@@ -355,6 +383,14 @@ class Settings(BaseSettings):
         default="http://localhost:6006/v1/traces",
         description="Phoenix OTLP endpoint for traces",
     )
+    phoenix_project_name: str = Field(
+        default="evalvault",
+        description="Phoenix project name for grouping traces",
+    )
+    phoenix_annotations_enabled: bool = Field(
+        default=True,
+        description="Enable automatic Phoenix span annotations",
+    )
     phoenix_api_token: str | None = Field(
         default=None,
         description="Phoenix API token for cloud deployments (optional)",
@@ -372,8 +408,8 @@ class Settings(BaseSettings):

     # Tracker Provider Selection
     tracker_provider: str = Field(
-        default="
-        description="Tracker provider: 'langfuse', 'mlflow',
+        default="mlflow+phoenix",
+        description="Tracker provider: 'langfuse', 'mlflow', 'phoenix', 'none', or combinations",
     )

     # Cluster map configuration
evalvault/domain/services/evaluator.py CHANGED

@@ -1934,6 +1934,8 @@ class RagasEvaluator:

     def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
         """Calculate estimated cost in USD based on model pricing."""
+        if "ollama" in model_name:
+            return 0.0
         # Find matching model key (exact or substring match)
         price_key = "openai/gpt-4o"  # Default fallback
         for key in self.MODEL_PRICING:
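The early return encodes that locally served Ollama models have no per-token price; other models still fall through to the `MODEL_PRICING` lookup with `openai/gpt-4o` as the fallback key. The guard in isolation:

```python
# Sketch: the short-circuit added to _calculate_cost.
def calculate_cost(model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
    if "ollama" in model_name:
        return 0.0  # local models incur no API cost
    ...  # non-ollama models fall through to the MODEL_PRICING lookup

assert calculate_cost("ollama/llama3", 1000, 200) == 0.0
```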
{evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.75.0
+Version: 1.77.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -66,6 +66,7 @@ Requires-Dist: ijson>=3.3.0; extra == 'dev'
 Requires-Dist: kiwipiepy>=0.18.0; extra == 'dev'
 Requires-Dist: langchain-anthropic; extra == 'dev'
 Requires-Dist: lm-eval[api]>=0.4.0; extra == 'dev'
+Requires-Dist: manim>=0.18.0; extra == 'dev'
 Requires-Dist: mkdocs-material>=9.5.0; extra == 'dev'
 Requires-Dist: mkdocs>=1.5.0; extra == 'dev'
 Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'dev'