judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.7.1.dist-info/RECORD +0 -82
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
# Module-name string used by the Judgeval tracer.
# NOTE(review): presumably passed as the instrumenting module name when an
# OpenTelemetry tracer is obtained — confirm at the call site.
JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "opentelemetry.instrumentation.judgeval"
|
@@ -0,0 +1,37 @@
|
|
1
|
+
from opentelemetry.sdk.trace.export import (
|
2
|
+
SpanExportResult,
|
3
|
+
SpanExporter,
|
4
|
+
)
|
5
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
6
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
7
|
+
from typing import Sequence
|
8
|
+
|
9
|
+
from judgeval.tracer.exporters.store import ABCSpanStore
|
10
|
+
from judgeval.tracer.exporters.s3 import S3Exporter
|
11
|
+
from judgeval.tracer.exporters.utils import deduplicate_spans
|
12
|
+
|
13
|
+
|
14
|
+
class JudgmentSpanExporter(OTLPSpanExporter):
    """OTLP/HTTP span exporter that sends Judgment credentials as headers.

    Every batch is passed through ``deduplicate_spans`` before it is handed
    to the parent exporter.
    """

    def __init__(self, endpoint: str, api_key: str, organization_id: str):
        """Configure the parent exporter with the endpoint and auth headers.

        Args:
            endpoint: URL the parent OTLP exporter posts spans to.
            api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
            organization_id: Sent in the ``X-Organization-Id`` header.
        """
        auth_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": organization_id,
        }
        super().__init__(endpoint=endpoint, headers=auth_headers)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Deduplicate ``spans`` and export the result via the parent class."""
        unique_spans = deduplicate_spans(spans)
        return super().export(unique_spans)
|
26
|
+
|
27
|
+
|
28
|
+
class InMemorySpanExporter(SpanExporter):
    """Exporter that forwards every span it receives to a span store."""

    def __init__(self, store: ABCSpanStore):
        """Remember the store that exported spans are added to."""
        self.store = store

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Add all ``spans`` to the store and report success."""
        target_store = self.store
        target_store.add(*spans)
        return SpanExportResult.SUCCESS
|
35
|
+
|
36
|
+
|
37
|
+
__all__ = ("JudgmentSpanExporter", "InMemorySpanExporter", "S3Exporter")
|
@@ -0,0 +1,119 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4
|
+
from datetime import datetime
|
5
|
+
from typing import Literal, Sequence, Optional, TYPE_CHECKING, cast
|
6
|
+
import boto3
|
7
|
+
from botocore.client import Config
|
8
|
+
|
9
|
+
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
|
10
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
11
|
+
from judgeval.env import (
|
12
|
+
JUDGMENT_S3_ACCESS_KEY_ID,
|
13
|
+
JUDGMENT_S3_SECRET_ACCESS_KEY,
|
14
|
+
JUDGMENT_S3_REGION_NAME,
|
15
|
+
JUDGMENT_S3_BUCKET_NAME,
|
16
|
+
JUDGMENT_S3_PREFIX,
|
17
|
+
JUDGMENT_S3_ENDPOINT_URL,
|
18
|
+
JUDGMENT_S3_SIGNATURE_VERSION,
|
19
|
+
JUDGMENT_S3_ADDRESSING_STYLE,
|
20
|
+
)
|
21
|
+
from judgeval.exceptions import JudgmentRuntimeError
|
22
|
+
from judgeval.logger import judgeval_logger
|
23
|
+
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
from mypy_boto3_s3.client import S3Client
|
26
|
+
|
27
|
+
|
28
|
+
class S3Exporter(SpanExporter):
    """Span exporter that writes each span as a JSON object into an S3 bucket.

    Objects are stored under ``{prefix}/{trace_id}/{span_id}/{timestamp}.json``
    where the ids are zero-padded lowercase hex and the timestamp is taken at
    upload time. Uploads for one ``export`` call run on a thread pool of at
    most ``batch_size`` workers.
    """

    # ``batch_size`` is assigned in ``__init__`` and therefore listed here too.
    __slots__ = ("bucket_name", "prefix", "s3_client", "batch_size")

    bucket_name: str
    prefix: str
    s3_client: S3Client
    batch_size: int

    def __init__(
        self,
        bucket_name: Optional[str] = JUDGMENT_S3_BUCKET_NAME,
        region_name: Optional[str] = JUDGMENT_S3_REGION_NAME,
        prefix: str = JUDGMENT_S3_PREFIX,
        s3_access_key_id: Optional[str] = JUDGMENT_S3_ACCESS_KEY_ID,
        s3_secret_access_key: Optional[str] = JUDGMENT_S3_SECRET_ACCESS_KEY,
        endpoint_url: Optional[str] = JUDGMENT_S3_ENDPOINT_URL,
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
        signature_version: str = JUDGMENT_S3_SIGNATURE_VERSION,
        addressing_style: str = JUDGMENT_S3_ADDRESSING_STYLE,
        batch_size: int = 8,
    ):
        """Validate the configuration and create the boto3 S3 client.

        Args:
            bucket_name: Target bucket; must be non-empty.
            region_name: Region passed to the boto3 client; must be non-empty.
            prefix: Key prefix; trailing slashes are stripped.
            s3_access_key_id: Access key id passed to the boto3 client.
            s3_secret_access_key: Secret access key passed to the boto3 client.
            endpoint_url: Optional custom endpoint passed to the boto3 client.
            signature_version: Signature version for the botocore ``Config``.
            addressing_style: One of ``"auto"``, ``"virtual"`` or ``"path"``.
            batch_size: Upper bound on concurrent uploads per ``export`` call.

        Raises:
            JudgmentRuntimeError: If the bucket name or region is missing, or
                the addressing style is not one of the accepted values.
        """
        if not bucket_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_BUCKET_NAME is not set")

        if not region_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_REGION_NAME is not set")

        if addressing_style not in ["auto", "virtual", "path"]:
            raise JudgmentRuntimeError(f"Invalid addressing style: {addressing_style}")
        addressing_style = cast(Literal["auto", "virtual", "path"], addressing_style)

        self.bucket_name = bucket_name
        self.prefix = prefix.rstrip("/")
        self.batch_size = batch_size

        self.s3_client = boto3.client(
            "s3",
            config=Config(
                signature_version=signature_version,
                s3={"addressing_style": addressing_style},
            ),
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=endpoint_url,
            region_name=region_name,
        )

    def _upload_span(self, span: ReadableSpan) -> tuple[bool, str]:
        """Upload a single span to S3. Returns (success, key).

        Any exception is logged and reported as ``(False, "")`` rather than
        propagated.
        """
        # Bound before the ``try`` so the error handler can always read it,
        # even when ``get_span_context()`` itself is what raised.
        span_context = None
        try:
            span_context = span.get_span_context()
            if not span_context:
                return False, ""

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
            trace_id = format(span_context.trace_id, "032x")
            span_id = format(span_context.span_id, "016x")
            key = f"{self.prefix}/{trace_id}/{span_id}/{timestamp}.json"

            span_json = span.to_json(indent=0)

            self.s3_client.put_object(
                Bucket=self.bucket_name,
                Key=key,
                Body=span_json,
                ContentType="application/json",
            )
            return True, key
        except Exception as e:
            judgeval_logger.error(
                f"Error uploading span {span_context.span_id if span_context else 'unknown'}: {e}"
            )
            return False, ""

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Upload ``spans`` concurrently and report the overall outcome.

        Returns ``SUCCESS`` for an empty batch or when every upload succeeds,
        and ``FAILURE`` as soon as one upload reports failure or an unexpected
        error occurs. Leaving the executor's ``with`` block still waits for
        uploads that were already submitted.
        """
        if not spans:
            return SpanExportResult.SUCCESS

        try:
            # ThreadPoolExecutor rejects max_workers < 1, so a non-positive
            # batch_size falls back to a single worker instead of failing
            # every export.
            worker_count = max(1, min(len(spans), self.batch_size))
            with ThreadPoolExecutor(max_workers=worker_count) as executor:
                futures = [executor.submit(self._upload_span, span) for span in spans]

                for future in as_completed(futures):
                    success, _key = future.result()
                    if not success:
                        return SpanExportResult.FAILURE
            return SpanExportResult.SUCCESS

        except Exception as e:
            judgeval_logger.error(f"Error exporting spans to S3: {e}")
            return SpanExportResult.FAILURE
|
@@ -0,0 +1,43 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
5
|
+
|
6
|
+
|
7
|
+
class ABCSpanStore(ABC):
    """Abstract interface for a container of ``ReadableSpan`` objects."""

    @abstractmethod
    def add(self, *spans: ReadableSpan):
        """Add the given spans to the store."""

    @abstractmethod
    def get(self, id: str) -> ReadableSpan:
        """Return the span identified by ``id``."""

    @abstractmethod
    def get_all(self) -> List[ReadableSpan]:
        """Return all spans held by the store."""
|
16
|
+
|
17
|
+
|
18
|
+
class SpanStore(ABCSpanStore):
    """Span store backed by a plain in-memory list."""

    __slots__ = ("spans",)

    spans: List[ReadableSpan]

    def __init__(self):
        """Start with an empty list of spans."""
        self.spans = []

    def add(self, *spans: ReadableSpan):
        """Append the given spans in the order received."""
        self.spans.extend(spans)

    def get(self, id: str) -> ReadableSpan:
        """Return the first stored span whose span id matches ``id``.

        Args:
            id: The span id as a 16-digit hex string (case-insensitive). A raw
                integer span id is accepted as well.

        Raises:
            ValueError: If no stored span has that id.
        """
        # Span contexts carry integer ids, so a string argument has to be
        # compared against the hex rendering; comparing it to the integer
        # directly can never match.
        wanted = id.lower() if isinstance(id, str) else id
        for span in self.spans:
            context = span.get_span_context()
            if context is None:
                continue
            if context.span_id == wanted or format(context.span_id, "016x") == wanted:
                return span

        raise ValueError(f"Span with id {id} not found")

    def get_all(self) -> List[ReadableSpan]:
        """Return the underlying list of spans (not a copy)."""
        return self.spans

    def __repr__(self) -> str:
        return f"SpanStore(spans={self.spans})"
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from typing import Sequence
|
2
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
3
|
+
|
4
|
+
from judgeval.tracer.keys import AttributeKeys
|
5
|
+
|
6
|
+
|
7
|
+
def deduplicate_spans(spans: Sequence[ReadableSpan]) -> Sequence[ReadableSpan]:
    """Keep one span per (trace_id, span_id): the one with the highest update id.

    Spans that have no attributes, no context, or whose
    ``AttributeKeys.JUDGMENT_UPDATE_ID`` attribute is not an ``int`` are left
    out of the result. When several spans share a key, a later span replaces
    the kept one only if its update id is strictly greater. The result lists
    the kept spans in the order their keys were first seen.
    """
    latest: dict[tuple[int, int], ReadableSpan] = {}

    for candidate in spans:
        # Guard clause: both attributes and context are required.
        if not (candidate.attributes and candidate.context):
            continue

        candidate_update = candidate.attributes.get(AttributeKeys.JUDGMENT_UPDATE_ID)
        if not isinstance(candidate_update, int):
            continue

        identity = (candidate.context.trace_id, candidate.context.span_id)
        kept = latest.get(identity)
        if kept is None:
            latest[identity] = candidate
            continue

        kept_attrs = kept.attributes
        if kept_attrs:
            kept_update = kept_attrs.get(AttributeKeys.JUDGMENT_UPDATE_ID, 0)
        else:
            kept_update = 0

        if isinstance(kept_update, (int, float)) and candidate_update > kept_update:
            latest[identity] = candidate

    return list(latest.values())
|
judgeval/tracer/keys.py
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
"""
|
2
|
+
Identifiers used by Judgeval to store specific types of data in the spans.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from opentelemetry.semconv.resource import ResourceAttributes
|
6
|
+
from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
|
7
|
+
from enum import Enum
|
8
|
+
|
9
|
+
|
10
|
+
class AttributeKeys(str, Enum):
    """Span attribute names used by Judgeval.

    Members under the ``judgment.`` namespace are Judgeval-specific; the
    ``GEN_AI_*`` members reuse the OpenTelemetry GenAI semantic-convention
    constants, except for the custom total-cost key at the end.
    """

    # --- General function tracing (custom namespace) ---
    JUDGMENT_SPAN_KIND = "judgment.span_kind"
    JUDGMENT_INPUT = "judgment.input"
    JUDGMENT_OUTPUT = "judgment.output"
    JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
    JUDGMENT_UPDATE_ID = "judgment.update_id"

    # --- Custom tracking ---
    JUDGMENT_CUSTOMER_ID = "judgment.customer_id"

    # --- Agent-specific (custom namespace) ---
    JUDGMENT_AGENT_ID = "judgment.agent_id"
    JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
    JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
    JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
    JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
    JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
    JUDGMENT_STATE_BEFORE = "judgment.state_before"
    JUDGMENT_STATE_AFTER = "judgment.state_after"

    # --- Evaluation-specific (custom namespace) ---
    PENDING_TRACE_EVAL = "judgment.pending_trace_eval"

    # --- GenAI (OpenTelemetry semantic conventions) ---
    GEN_AI_PROMPT = gen_ai_attributes.GEN_AI_PROMPT
    GEN_AI_COMPLETION = gen_ai_attributes.GEN_AI_COMPLETION
    GEN_AI_REQUEST_MODEL = gen_ai_attributes.GEN_AI_REQUEST_MODEL
    GEN_AI_RESPONSE_MODEL = gen_ai_attributes.GEN_AI_RESPONSE_MODEL
    GEN_AI_SYSTEM = gen_ai_attributes.GEN_AI_SYSTEM
    GEN_AI_USAGE_INPUT_TOKENS = gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS
    GEN_AI_USAGE_OUTPUT_TOKENS = gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS
    GEN_AI_USAGE_COMPLETION_TOKENS = gen_ai_attributes.GEN_AI_USAGE_COMPLETION_TOKENS
    GEN_AI_REQUEST_TEMPERATURE = gen_ai_attributes.GEN_AI_REQUEST_TEMPERATURE
    GEN_AI_REQUEST_MAX_TOKENS = gen_ai_attributes.GEN_AI_REQUEST_MAX_TOKENS
    GEN_AI_RESPONSE_FINISH_REASONS = gen_ai_attributes.GEN_AI_RESPONSE_FINISH_REASONS

    # --- GenAI (custom namespace) ---
    GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost_usd"
|
49
|
+
|
50
|
+
|
51
|
+
class InternalAttributeKeys(str, Enum):
    """
    Attribute keys for temporary, internal state kept by span processors.

    They serve span lifecycle management only and are NOT exported.
    """

    # Span control flags
    DISABLE_PARTIAL_EMIT = "disable_partial_emit"
    CANCELLED = "cancelled"
|
60
|
+
|
61
|
+
|
62
|
+
class ResourceKeys(str, Enum):
    """Resource attribute names: OpenTelemetry service/SDK keys plus the Judgment project id."""

    # Reused from the OpenTelemetry resource semantic conventions.
    SERVICE_NAME = ResourceAttributes.SERVICE_NAME
    TELEMETRY_SDK_LANGUAGE = ResourceAttributes.TELEMETRY_SDK_LANGUAGE
    TELEMETRY_SDK_NAME = ResourceAttributes.TELEMETRY_SDK_NAME
    TELEMETRY_SDK_VERSION = ResourceAttributes.TELEMETRY_SDK_VERSION

    # Judgeval-specific key (custom namespace).
    JUDGMENT_PROJECT_ID = "judgment.project_id"
|