judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Name that identifies judgeval as the instrumenting module.
# NOTE(review): presumably passed as the instrumenting module name when a
# tracer is obtained from OpenTelemetry - confirm against the call sites.
JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "judgeval"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from opentelemetry.sdk.trace.export import (
|
|
2
|
+
SpanExportResult,
|
|
3
|
+
SpanExporter,
|
|
4
|
+
)
|
|
5
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
6
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from judgeval.tracer.exporters.store import ABCSpanStore
|
|
10
|
+
from judgeval.tracer.exporters.s3 import S3Exporter
|
|
11
|
+
from judgeval.tracer.exporters.utils import deduplicate_spans
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JudgmentSpanExporter(OTLPSpanExporter):
    """OTLP/HTTP span exporter that authenticates with Judgment credentials.

    Every request carries an ``Authorization`` bearer token plus the
    organization and project identifiers as custom headers. Batches are
    passed through ``deduplicate_spans`` before being handed to the parent
    exporter.
    """

    def __init__(
        self, endpoint: str, api_key: str, organization_id: str, project_id: str
    ):
        """Configure the underlying OTLP exporter.

        Args:
            endpoint: URL the parent OTLP exporter sends spans to.
            api_key: Token placed in the ``Authorization: Bearer`` header.
            organization_id: Value of the ``X-Organization-Id`` header.
            project_id: Value of the ``X-Project-Id`` header.
        """
        request_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": organization_id,
            "X-Project-Id": project_id,
        }
        super().__init__(endpoint=endpoint, headers=request_headers)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Deduplicate ``spans`` and export what remains via the parent class."""
        unique_spans = deduplicate_spans(spans)
        return super().export(unique_spans)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class InMemorySpanExporter(SpanExporter):
    """Exporter that hands every span to a local ``ABCSpanStore``.

    ``export`` only calls ``store.add`` and then reports success; nothing
    else is done with the spans here.
    """

    def __init__(self, store: ABCSpanStore):
        """Remember the store that will receive exported spans."""
        self.store = store

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Add all ``spans`` to the store and return ``SpanExportResult.SUCCESS``."""
        target_store = self.store
        target_store.add(*spans)
        return SpanExportResult.SUCCESS
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Public names of this package. S3Exporter is defined in
# judgeval.tracer.exporters.s3 and re-exported here.
__all__ = ("JudgmentSpanExporter", "InMemorySpanExporter", "S3Exporter")
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Literal, Sequence, Optional, TYPE_CHECKING, cast
|
|
6
|
+
import boto3
|
|
7
|
+
from botocore.client import Config
|
|
8
|
+
|
|
9
|
+
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
|
|
10
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
11
|
+
from judgeval.env import (
|
|
12
|
+
JUDGMENT_S3_ACCESS_KEY_ID,
|
|
13
|
+
JUDGMENT_S3_SECRET_ACCESS_KEY,
|
|
14
|
+
JUDGMENT_S3_REGION_NAME,
|
|
15
|
+
JUDGMENT_S3_BUCKET_NAME,
|
|
16
|
+
JUDGMENT_S3_PREFIX,
|
|
17
|
+
JUDGMENT_S3_ENDPOINT_URL,
|
|
18
|
+
JUDGMENT_S3_SIGNATURE_VERSION,
|
|
19
|
+
JUDGMENT_S3_ADDRESSING_STYLE,
|
|
20
|
+
)
|
|
21
|
+
from judgeval.exceptions import JudgmentRuntimeError
|
|
22
|
+
from judgeval.logger import judgeval_logger
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from mypy_boto3_s3.client import S3Client
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class S3Exporter(SpanExporter):
    """Span exporter that uploads every span to S3 as its own JSON object.

    Each span is serialized with ``ReadableSpan.to_json`` and written to
    ``{prefix}/{trace_id}/{span_id}/{timestamp}.json``. The IDs are
    zero-padded lowercase hex (32 and 16 characters) and the timestamp is
    taken at upload time. Uploads belonging to one ``export`` call run
    concurrently on a thread pool with at most ``batch_size`` workers.
    """

    # batch_size is assigned in __init__, so it is declared alongside the
    # other instance attributes.
    __slots__ = ("bucket_name", "prefix", "s3_client", "batch_size")

    bucket_name: str
    prefix: str
    s3_client: S3Client
    batch_size: int

    def __init__(
        self,
        bucket_name: Optional[str] = JUDGMENT_S3_BUCKET_NAME,
        region_name: Optional[str] = JUDGMENT_S3_REGION_NAME,
        prefix: str = JUDGMENT_S3_PREFIX,
        s3_access_key_id: Optional[str] = JUDGMENT_S3_ACCESS_KEY_ID,
        s3_secret_access_key: Optional[str] = JUDGMENT_S3_SECRET_ACCESS_KEY,
        endpoint_url: Optional[str] = JUDGMENT_S3_ENDPOINT_URL,
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
        signature_version: str = JUDGMENT_S3_SIGNATURE_VERSION,
        addressing_style: str = JUDGMENT_S3_ADDRESSING_STYLE,
        batch_size: int = 8,
    ):
        """Validate the configuration and create the boto3 S3 client.

        All defaults except ``batch_size`` come from ``judgeval.env``.

        Args:
            bucket_name: Destination bucket; must be non-empty.
            region_name: Region handed to boto3; must be non-empty.
            prefix: Key prefix; trailing slashes are stripped.
            s3_access_key_id: Access key ID handed to boto3.
            s3_secret_access_key: Secret access key handed to boto3.
            endpoint_url: Endpoint URL handed to boto3.
            signature_version: botocore ``Config`` signature version.
            addressing_style: One of ``"auto"``, ``"virtual"`` or ``"path"``.
            batch_size: Upper bound on concurrent uploads per export call.

        Raises:
            JudgmentRuntimeError: If ``bucket_name`` or ``region_name`` is
                empty, or ``addressing_style`` is not an accepted value.
        """
        if not bucket_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_BUCKET_NAME is not set")

        if not region_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_REGION_NAME is not set")

        if addressing_style not in ("auto", "virtual", "path"):
            raise JudgmentRuntimeError(f"Invalid addressing style: {addressing_style}")
        addressing_style = cast(Literal["auto", "virtual", "path"], addressing_style)

        self.bucket_name = bucket_name
        # NOTE(review): an empty prefix yields keys that start with "/" -
        # confirm whether that layout is intended before changing it.
        self.prefix = prefix.rstrip("/")
        self.batch_size = batch_size

        self.s3_client = boto3.client(
            "s3",
            config=Config(
                signature_version=signature_version,
                s3={"addressing_style": addressing_style},
            ),
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=endpoint_url,
            region_name=region_name,
        )

    def _upload_span(self, span: ReadableSpan) -> tuple[bool, str]:
        """Upload a single span to S3. Returns (success, key).

        Never raises for ordinary errors: any ``Exception`` is logged and
        reported as ``(False, "")``.
        """
        # Bound before the try block so the error handler below can always
        # reference it, even when get_span_context() itself raises.
        span_context = None
        try:
            span_context = span.get_span_context()
            if not span_context:
                return False, ""

            # datetime.now() without a tz gives naive local time, so the key
            # suffix depends on the host's timezone.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
            trace_id = format(span_context.trace_id, "032x")
            span_id = format(span_context.span_id, "016x")
            key = f"{self.prefix}/{trace_id}/{span_id}/{timestamp}.json"

            span_json = span.to_json(indent=0)

            self.s3_client.put_object(
                Bucket=self.bucket_name,
                Key=key,
                Body=span_json,
                ContentType="application/json",
            )
            return True, key
        except Exception as e:
            judgeval_logger.error(
                f"Error uploading span {span_context.span_id if span_context else 'unknown'}: {e}"
            )
            return False, ""

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Upload ``spans`` concurrently.

        Returns:
            ``SpanExportResult.SUCCESS`` when ``spans`` is empty or every
            upload succeeded; ``SpanExportResult.FAILURE`` as soon as one
            upload reports failure or an unexpected exception occurs.
        """
        if not spans:
            return SpanExportResult.SUCCESS

        try:
            # ThreadPoolExecutor rejects max_workers <= 0, so a non-positive
            # batch_size falls back to a single worker instead of failing
            # every export.
            worker_count = max(1, min(len(spans), self.batch_size))
            with ThreadPoolExecutor(max_workers=worker_count) as executor:
                futures = [executor.submit(self._upload_span, span) for span in spans]

                for future in as_completed(futures):
                    success, _ = future.result()
                    if not success:
                        # Leaving the with-block still waits for the
                        # remaining uploads to finish.
                        return SpanExportResult.FAILURE
            return SpanExportResult.SUCCESS

        except Exception as e:
            judgeval_logger.error(f"Error exporting spans to S3: {e}")
            return SpanExportResult.FAILURE
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
|
|
4
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ABCSpanStore(ABC):
    """Abstract interface for a container of spans, addressable by trace ID."""

    @abstractmethod
    def add(self, *spans: ReadableSpan):
        """Put one or more spans into the store."""

    @abstractmethod
    def get_all(self) -> List[ReadableSpan]:
        """Return every span currently held by the store."""

    @abstractmethod
    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
        """Return the spans recorded for ``trace_id``."""

    @abstractmethod
    def clear_trace(self, trace_id: str):
        """Remove the spans recorded for ``trace_id``."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SpanStore(ABCSpanStore):
    """In-memory span store that groups spans by hex trace ID."""

    __slots__ = ("_spans_by_trace",)

    # Maps a 32-character hex trace ID to the spans added for that trace,
    # in insertion order.
    _spans_by_trace: Dict[str, List[ReadableSpan]]

    def __init__(self):
        """Start with no traces recorded."""
        self._spans_by_trace = {}

    def add(self, *spans: ReadableSpan):
        """Append each span to its trace's list; spans without a context are skipped."""
        for span in spans:
            span_context = span.get_span_context()
            if span_context is not None:
                # Convert trace_id to hex string per OTEL spec
                hex_trace_id = format(span_context.trace_id, "032x")
                self._spans_by_trace.setdefault(hex_trace_id, []).append(span)

    def get_all(self) -> List[ReadableSpan]:
        """Return a new list holding the spans of every trace."""
        return [
            span
            for trace_spans in self._spans_by_trace.values()
            for span in trace_spans
        ]

    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
        """Get all spans for a specific trace ID (32-char hex string).

        For a known trace this is the store's own list, not a copy; for an
        unknown trace it is a fresh empty list.
        """
        return self._spans_by_trace.get(trace_id, [])

    def clear_trace(self, trace_id: str):
        """Clear all spans for a specific trace ID (32-char hex string)."""
        self._spans_by_trace.pop(trace_id, None)

    def __repr__(self) -> str:
        """Summarize the store as trace and span counts."""
        total_spans = sum(map(len, self._spans_by_trace.values()))
        return (
            f"SpanStore(traces={len(self._spans_by_trace)}, total_spans={total_spans})"
        )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Sequence
|
|
2
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
3
|
+
|
|
4
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def deduplicate_spans(spans: Sequence[ReadableSpan]) -> Sequence[ReadableSpan]:
    """Keep one span per ``(trace_id, span_id)``: the one with the highest update ID.

    Spans are considered only when they have non-empty attributes, a
    context, and an integer ``AttributeKeys.JUDGMENT_UPDATE_ID`` attribute;
    every other span is left out of the result. When several spans share a
    key, a later span replaces the kept one only if its update ID is
    strictly greater, and the result keeps the position of the first
    occurrence.

    Args:
        spans: Spans to filter, in export order.

    Returns:
        A new list with the surviving spans.
    """
    latest_by_key: dict[tuple[int, int], ReadableSpan] = {}

    for candidate in spans:
        if not (candidate.attributes and candidate.context):
            continue

        candidate_update_id = candidate.attributes.get(
            AttributeKeys.JUDGMENT_UPDATE_ID
        )
        if not isinstance(candidate_update_id, int):
            continue

        span_key = (candidate.context.trace_id, candidate.context.span_id)
        if span_key not in latest_by_key:
            latest_by_key[span_key] = candidate
            continue

        # A span with this key is already kept; compare update IDs.
        kept_attrs = latest_by_key[span_key].attributes
        if kept_attrs:
            kept_update_id = kept_attrs.get(AttributeKeys.JUDGMENT_UPDATE_ID, 0)
        else:
            kept_update_id = 0

        if not isinstance(kept_update_id, (int, float)):
            continue
        if candidate_update_id > kept_update_id:
            latest_by_key[span_key] = candidate

    return list(latest_by_key.values())
|
judgeval/tracer/keys.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Identifiers used by Judgeval to store specific types of data in the spans.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AttributeKeys(str, Enum):
    """Span attribute names used by Judgeval.

    The ``str`` mixin makes every member compare equal to its string value.
    """

    # Keys under the "judgment." prefix.
    JUDGMENT_SPAN_KIND = "judgment.span_kind"
    JUDGMENT_INPUT = "judgment.input"
    JUDGMENT_OUTPUT = "judgment.output"
    JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
    JUDGMENT_UPDATE_ID = "judgment.update_id"

    JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"

    JUDGMENT_CUSTOMER_ID = "judgment.customer_id"

    # Agent identity and state keys.
    JUDGMENT_AGENT_ID = "judgment.agent_id"
    JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
    JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
    JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
    JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
    JUDGMENT_STATE_BEFORE = "judgment.state_before"
    JUDGMENT_STATE_AFTER = "judgment.state_after"

    PENDING_TRACE_EVAL = "judgment.pending_trace_eval"

    # Keys under "judgment.llm." and "judgment.usage.".
    JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
    JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
    JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
    JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
        "judgment.usage.cache_creation_input_tokens"
    )
    JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
    JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
    JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"

    # Keys under the "gen_ai." prefix.
    GEN_AI_PROMPT = "gen_ai.prompt"
    GEN_AI_COMPLETION = "gen_ai.completion"
    GEN_AI_SYSTEM = "gen_ai.system"
    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
    GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class InternalAttributeKeys(str, Enum):
    """
    Internal attribute keys used for temporary state management in span processors.
    These are NOT exported and are used only for internal span lifecycle management.

    The ``str`` mixin makes every member compare equal to its string value.
    """

    DISABLE_PARTIAL_EMIT = "disable_partial_emit"
    CANCELLED = "cancelled"
    IS_CUSTOMER_CONTEXT_OWNER = "is_customer_context_owner"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class ResourceKeys(str, Enum):
    """Resource-level attribute names.

    ``service.name`` and the ``telemetry.sdk.*`` entries use the names from
    OpenTelemetry's resource conventions; ``judgment.project_id`` is the
    Judgment-specific addition. The ``str`` mixin makes every member compare
    equal to its string value.
    """

    SERVICE_NAME = "service.name"

    # SDK identification.
    TELEMETRY_SDK_LANGUAGE = "telemetry.sdk.language"
    TELEMETRY_SDK_NAME = "telemetry.sdk.name"
    TELEMETRY_SDK_VERSION = "telemetry.sdk.version"

    JUDGMENT_PROJECT_ID = "judgment.project_id"
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
from judgeval.logger import judgeval_logger
|
|
4
|
+
|
|
5
|
+
from judgeval.tracer.llm.constants import ProviderType
|
|
6
|
+
from judgeval.tracer.llm.providers import (
|
|
7
|
+
HAS_OPENAI,
|
|
8
|
+
HAS_TOGETHER,
|
|
9
|
+
HAS_ANTHROPIC,
|
|
10
|
+
HAS_GOOGLE_GENAI,
|
|
11
|
+
ApiClient,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from judgeval.tracer import Tracer
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _detect_provider(client: ApiClient) -> ProviderType:
    """Work out which provider SDK ``client`` is an instance of.

    The SDKs are probed in the order OpenAI, Anthropic, Together, Google
    GenAI, and only when the matching ``HAS_*`` flag is truthy; each SDK's
    client classes are imported inside its own branch. If nothing matches,
    a warning is logged and ``ProviderType.DEFAULT`` is returned.
    """
    if HAS_OPENAI:
        from openai import OpenAI, AsyncOpenAI

        openai_clients = (OpenAI, AsyncOpenAI)
        if isinstance(client, openai_clients):
            return ProviderType.OPENAI

    if HAS_ANTHROPIC:
        from anthropic import Anthropic, AsyncAnthropic

        anthropic_clients = (Anthropic, AsyncAnthropic)
        if isinstance(client, anthropic_clients):
            return ProviderType.ANTHROPIC

    if HAS_TOGETHER:
        from together import Together, AsyncTogether  # type: ignore[import-untyped]

        together_clients = (Together, AsyncTogether)
        if isinstance(client, together_clients):
            return ProviderType.TOGETHER

    if HAS_GOOGLE_GENAI:
        from google.genai import Client as GoogleClient

        if isinstance(client, GoogleClient):
            return ProviderType.GOOGLE

    # No installed SDK recognised the client: warn and use the default type.
    judgeval_logger.warning(
        f"Unknown client type {type(client)}, Trying to wrap as OpenAI-compatible. "
        "If this is a mistake or you think we should support this client, please file an issue at https://github.com/JudgmentLabs/judgeval/issues!"
    )
    return ProviderType.DEFAULT
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def wrap_provider(tracer: Tracer, client: ApiClient) -> ApiClient:
    """
    Wraps an API client to add tracing capabilities.
    Supports OpenAI, Together, Anthropic, and Google GenAI clients.

    The provider is determined by ``_detect_provider``; the matching wrapper
    module is imported only when it is needed. Clients that are not
    recognised are wrapped with the OpenAI wrapper.
    """
    detected = _detect_provider(client)

    if detected == ProviderType.ANTHROPIC:
        from .llm_anthropic.wrapper import wrap_anthropic_client

        return wrap_anthropic_client(tracer, client)

    if detected == ProviderType.TOGETHER:
        from .llm_together.wrapper import wrap_together_client

        return wrap_together_client(tracer, client)

    if detected == ProviderType.GOOGLE:
        from .llm_google.wrapper import wrap_google_client

        return wrap_google_client(tracer, client)

    # ProviderType.OPENAI and every other (unknown) provider type share the
    # OpenAI-compatible wrapper.
    from .llm_openai.wrapper import wrap_openai_client

    return wrap_openai_client(tracer, client)
|