judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/__init__.py
CHANGED
|
@@ -1,3 +1,1112 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from contextvars import ContextVar
|
|
3
|
+
import atexit
|
|
4
|
+
import functools
|
|
5
|
+
import inspect
|
|
6
|
+
import random
|
|
7
|
+
from typing import (
|
|
8
|
+
Any,
|
|
9
|
+
Union,
|
|
10
|
+
Callable,
|
|
11
|
+
Dict,
|
|
12
|
+
List,
|
|
13
|
+
Optional,
|
|
14
|
+
Tuple,
|
|
15
|
+
Type,
|
|
16
|
+
TypeVar,
|
|
17
|
+
overload,
|
|
18
|
+
Literal,
|
|
19
|
+
TypedDict,
|
|
20
|
+
Generator,
|
|
21
|
+
AsyncGenerator,
|
|
22
|
+
Iterable,
|
|
23
|
+
)
|
|
24
|
+
import contextvars
|
|
25
|
+
import asyncio
|
|
26
|
+
from functools import partial
|
|
27
|
+
from warnings import warn
|
|
2
28
|
|
|
3
|
-
|
|
29
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
30
|
+
from opentelemetry.sdk.resources import Resource
|
|
31
|
+
from opentelemetry.trace import (
|
|
32
|
+
Status,
|
|
33
|
+
StatusCode,
|
|
34
|
+
Tracer as ABCTracer,
|
|
35
|
+
Span,
|
|
36
|
+
get_current_span,
|
|
37
|
+
get_tracer_provider,
|
|
38
|
+
set_tracer_provider,
|
|
39
|
+
INVALID_SPAN_CONTEXT,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
from judgeval.data.evaluation_run import ExampleEvaluationRun, TraceEvaluationRun
|
|
43
|
+
from judgeval.data.example import Example
|
|
44
|
+
from judgeval.env import (
|
|
45
|
+
JUDGMENT_API_KEY,
|
|
46
|
+
JUDGMENT_ORG_ID,
|
|
47
|
+
JUDGMENT_ENABLE_MONITORING,
|
|
48
|
+
JUDGMENT_ENABLE_EVALUATIONS,
|
|
49
|
+
)
|
|
50
|
+
from judgeval.logger import judgeval_logger
|
|
51
|
+
from judgeval.scorers.api_scorer import TraceAPIScorerConfig, ExampleAPIScorerConfig
|
|
52
|
+
from judgeval.scorers.example_scorer import ExampleScorer
|
|
53
|
+
from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
|
|
54
|
+
from judgeval.tracer.managers import (
|
|
55
|
+
sync_span_context,
|
|
56
|
+
async_span_context,
|
|
57
|
+
sync_agent_context,
|
|
58
|
+
async_agent_context,
|
|
59
|
+
)
|
|
60
|
+
from judgeval.utils.decorators.dont_throw import dont_throw
|
|
61
|
+
from judgeval.utils.guards import expect_api_key, expect_organization_id
|
|
62
|
+
from judgeval.utils.serialize import safe_serialize
|
|
63
|
+
from judgeval.utils.meta import SingletonMeta
|
|
64
|
+
from judgeval.version import get_version
|
|
65
|
+
from judgeval.warnings import JudgmentWarning
|
|
66
|
+
|
|
67
|
+
from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
|
|
68
|
+
from judgeval.api import JudgmentSyncClient
|
|
69
|
+
from judgeval.tracer.llm import wrap_provider
|
|
70
|
+
from judgeval.utils.url import url_for
|
|
71
|
+
from judgeval.tracer.processors import (
|
|
72
|
+
JudgmentSpanProcessor,
|
|
73
|
+
NoOpJudgmentSpanProcessor,
|
|
74
|
+
)
|
|
75
|
+
from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
|
|
76
|
+
from judgeval.utils.project import _resolve_project_id
|
|
77
|
+
from opentelemetry.trace import use_span
|
|
78
|
+
|
|
79
|
+
C = TypeVar("C", bound=Callable)
|
|
80
|
+
Cls = TypeVar("Cls", bound=Type)
|
|
81
|
+
ApiClient = TypeVar("ApiClient", bound=Any)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class AgentContext(TypedDict):
    """Dictionary shape stored in the tracer's ``agent_context`` variable.

    The tracer reads these entries when it copies agent information onto
    spans and when it snapshots the state of the tracked instance.
    """

    # Agent identity; copied onto spans as agent attributes.
    agent_id: str
    class_name: str | None
    instance_name: str | None
    # When truthy, the tracked instance's state is recorded on spans.
    track_state: bool
    # Attribute names to record; ``None`` means every attribute in the
    # instance ``__dict__`` whose name does not start with an underscore.
    track_attributes: List[str] | None
    # Maps an attribute name to the name it is reported under.
    field_mappings: Dict[str, str]
    # The object whose attributes are read when state is recorded.
    instance: Any
    # Copied onto the span, then reset to False by the tracer.
    is_agent_entry_point: bool
    parent_agent_id: str | None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class Tracer(metaclass=SingletonMeta):
|
|
97
|
+
__slots__ = (
|
|
98
|
+
"api_key",
|
|
99
|
+
"organization_id",
|
|
100
|
+
"project_name",
|
|
101
|
+
"enable_monitoring",
|
|
102
|
+
"enable_evaluation",
|
|
103
|
+
"resource_attributes",
|
|
104
|
+
"api_client",
|
|
105
|
+
"judgment_processor",
|
|
106
|
+
"tracer",
|
|
107
|
+
"agent_context",
|
|
108
|
+
"customer_id",
|
|
109
|
+
"_initialized",
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
api_key: str | None
|
|
113
|
+
organization_id: str | None
|
|
114
|
+
project_name: str
|
|
115
|
+
enable_monitoring: bool
|
|
116
|
+
enable_evaluation: bool
|
|
117
|
+
resource_attributes: Optional[Dict[str, Any]]
|
|
118
|
+
api_client: JudgmentSyncClient
|
|
119
|
+
judgment_processor: JudgmentSpanProcessor
|
|
120
|
+
tracer: ABCTracer
|
|
121
|
+
agent_context: ContextVar[Optional[AgentContext]]
|
|
122
|
+
customer_id: ContextVar[Optional[str]]
|
|
123
|
+
_initialized: bool
|
|
124
|
+
|
|
125
|
+
def __init__(
|
|
126
|
+
self,
|
|
127
|
+
/,
|
|
128
|
+
*,
|
|
129
|
+
project_name: str,
|
|
130
|
+
api_key: str | None = None,
|
|
131
|
+
organization_id: str | None = None,
|
|
132
|
+
enable_monitoring: bool = JUDGMENT_ENABLE_MONITORING.lower() == "true",
|
|
133
|
+
enable_evaluation: bool = JUDGMENT_ENABLE_EVALUATIONS.lower() == "true",
|
|
134
|
+
resource_attributes: Optional[Dict[str, Any]] = None,
|
|
135
|
+
initialize: bool = True,
|
|
136
|
+
):
|
|
137
|
+
if not hasattr(self, "_initialized"):
|
|
138
|
+
self._initialized = False
|
|
139
|
+
self.agent_context = ContextVar("current_agent_context", default=None)
|
|
140
|
+
self.customer_id = ContextVar("current_customer_id", default=None)
|
|
141
|
+
|
|
142
|
+
self.project_name = project_name
|
|
143
|
+
self.api_key = expect_api_key(api_key or JUDGMENT_API_KEY)
|
|
144
|
+
self.organization_id = expect_organization_id(
|
|
145
|
+
organization_id or JUDGMENT_ORG_ID
|
|
146
|
+
)
|
|
147
|
+
self.enable_monitoring = enable_monitoring
|
|
148
|
+
self.enable_evaluation = enable_evaluation
|
|
149
|
+
self.resource_attributes = resource_attributes
|
|
150
|
+
|
|
151
|
+
if self.api_key and self.organization_id:
|
|
152
|
+
self.api_client = JudgmentSyncClient(
|
|
153
|
+
api_key=self.api_key, organization_id=self.organization_id
|
|
154
|
+
)
|
|
155
|
+
else:
|
|
156
|
+
judgeval_logger.error(
|
|
157
|
+
"API Key or Organization ID is not set. You must set them in the environment variables to use the tracer."
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
if initialize:
|
|
161
|
+
self.initialize()
|
|
162
|
+
|
|
163
|
+
def initialize(self) -> Tracer:
    """Build the span processor and tracer once, then return ``self``.

    Later calls return immediately. A no-op processor is installed first.
    When monitoring is enabled and the API key, organization id and a
    resolved project id are all available, it is replaced by the processor
    from ``get_processor`` and a new ``TracerProvider`` carrying it is
    registered through ``set_tracer_provider``. The flush hook
    ``self._atexit_flush`` is registered with ``atexit`` at the end.
    """
    if self._initialized:
        return self

    self.judgment_processor = NoOpJudgmentSpanProcessor()
    if self.enable_monitoring:
        project_id = _resolve_project_id(
            self.project_name, self.api_key, self.organization_id
        )
        has_credentials = bool(self.api_key and self.organization_id)
        if has_credentials and project_id:
            self.judgment_processor = self.get_processor(
                tracer=self,
                project_name=self.project_name,
                project_id=project_id,
                api_key=self.api_key,
                organization_id=self.organization_id,
                resource_attributes=self.resource_attributes,
            )

            provider = TracerProvider(
                resource=Resource.create(self.judgment_processor.resource_attributes)
            )
            provider.add_span_processor(self.judgment_processor)
            set_tracer_provider(provider)
        elif has_credentials:
            # Credentials are present, so the missing piece is the project id.
            judgeval_logger.error(
                f"Failed to resolve or autocreate project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
            )

    # The tracer is always taken from whichever provider is currently registered.
    self.tracer = get_tracer_provider().get_tracer(
        JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME,
        get_version(),
    )

    self._initialized = True
    atexit.register(self._atexit_flush)
    return self
|
|
200
|
+
|
|
201
|
+
@staticmethod
def get_exporter(
    project_id: str,
    api_key: Optional[str] = None,
    organization_id: Optional[str] = None,
):
    """Create a ``JudgmentSpanExporter`` for the ``/otel/v1/traces`` endpoint.

    Args:
        project_id: Project id passed through to the exporter.
        api_key: API key; falls back to ``JUDGMENT_API_KEY`` when falsy.
        organization_id: Organization id; falls back to ``JUDGMENT_ORG_ID``
            when falsy.

    Returns:
        The exporter, or ``None`` (after logging an error) when either
        credential is still missing after the fallback.
    """
    from judgeval.tracer.exporters import JudgmentSpanExporter

    resolved_api_key = api_key or JUDGMENT_API_KEY
    resolved_org_id = organization_id or JUDGMENT_ORG_ID

    if resolved_api_key and resolved_org_id:
        return JudgmentSpanExporter(
            endpoint=url_for("/otel/v1/traces"),
            api_key=resolved_api_key,
            organization_id=resolved_org_id,
            project_id=project_id,
        )

    judgeval_logger.error(
        "API Key or Organization ID is not set. You must set them in the environment variables to use the tracer."
    )
    return None
|
|
224
|
+
|
|
225
|
+
@staticmethod
def get_processor(
    tracer: Tracer,
    project_name: str,
    project_id: str,
    api_key: Optional[str] = None,
    organization_id: Optional[str] = None,
    max_queue_size: int = 2**18,
    export_timeout_millis: int = 30000,
    resource_attributes: Optional[Dict[str, Any]] = None,
) -> JudgmentSpanProcessor:
    """Build the span processor for ``tracer``.

    ``api_key`` and ``organization_id`` fall back to ``JUDGMENT_API_KEY`` and
    ``JUDGMENT_ORG_ID`` when falsy. If either is still missing, an error is
    logged and a ``NoOpJudgmentSpanProcessor`` is returned. Otherwise a
    ``JudgmentSpanProcessor`` is returned; the queue size, export timeout
    and resource attributes are forwarded as keyword arguments.
    """
    resolved_api_key = api_key or JUDGMENT_API_KEY
    resolved_org_id = organization_id or JUDGMENT_ORG_ID

    if resolved_api_key and resolved_org_id:
        return JudgmentSpanProcessor(
            tracer,
            project_name,
            project_id,
            resolved_api_key,
            resolved_org_id,
            max_queue_size=max_queue_size,
            export_timeout_millis=export_timeout_millis,
            resource_attributes=resource_attributes,
        )

    judgeval_logger.error(
        "API Key or Organization ID is not set. You must set them in the environment variables to use the tracer."
    )
    return NoOpJudgmentSpanProcessor()
|
|
254
|
+
|
|
255
|
+
def get_current_span(self):
    """Return the result of the module-level ``get_current_span`` import."""
    # Inside the class body this name resolves to the function imported from
    # ``opentelemetry.trace``, not to this method.
    active_span = get_current_span()
    return active_span
|
|
257
|
+
|
|
258
|
+
def get_tracer(self):
    """Return the tracer object stored on ``self.tracer``."""
    tracer = self.tracer
    return tracer
|
|
260
|
+
|
|
261
|
+
def get_current_agent_context(self):
    """Return the ``agent_context`` variable itself, not its current value.

    Callers obtain the current agent context with ``.get()`` on the result.
    """
    agent_context_var = self.agent_context
    return agent_context_var
|
|
263
|
+
|
|
264
|
+
def get_current_customer_context(self):
    """Return the ``customer_id`` variable itself, not its current value.

    Callers read the current id with ``.get()`` and change it with ``.set()``.
    """
    customer_id_var = self.customer_id
    return customer_id_var
|
|
266
|
+
|
|
267
|
+
def get_span_processor(self) -> JudgmentSpanProcessor:
    """Return the span processor held in ``self.judgment_processor``."""
    processor = self.judgment_processor
    return processor
|
|
270
|
+
|
|
271
|
+
@dont_throw
def set_customer_id(self, customer_id: str) -> None:
    """Attach ``customer_id`` to the current span and remember it in context.

    Nothing is changed, and a warning is logged, when ``customer_id`` is
    empty, when there is no recording span, or when the customer context
    variable already holds a truthy value.

    Otherwise the id is written to the span, stored in the customer context
    variable, and the span is flagged in the span processor as the owner of
    that context.

    Args:
        customer_id: Identifier to record; must be non-empty.
    """
    if not customer_id:
        judgeval_logger.warning("Customer ID is empty, skipping.")
        return

    span = self.get_current_span()
    if not span or not span.is_recording():
        judgeval_logger.warning(
            "No active span found. Customer ID will not be set."
        )
        return

    customer_context = self.get_current_customer_context()
    if customer_context.get():
        judgeval_logger.warning("Customer ID is already set, skipping.")
        return

    # The guards above already guarantee a recording span, so the span does
    # not need to be checked a second time before it is used.
    set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
    customer_context.set(customer_id)

    # Flag this span as the context owner; _maybe_clear_customer_context
    # looks the flag up to decide whether to reset the context.
    self.get_span_processor().set_internal_attribute(
        span_context=span.get_span_context(),
        key=InternalAttributeKeys.IS_CUSTOMER_CONTEXT_OWNER,
        value=True,
    )
|
|
298
|
+
|
|
299
|
+
def _maybe_clear_customer_context(self, span: Span) -> None:
    """Reset the customer context to ``None`` if ``span`` is flagged as its owner.

    Ownership is read from the span processor's internal attribute
    ``IS_CUSTOMER_CONTEXT_OWNER`` for this span's context, defaulting to
    ``False``.
    """
    is_owner = self.get_span_processor().get_internal_attribute(
        span_context=span.get_span_context(),
        key=InternalAttributeKeys.IS_CUSTOMER_CONTEXT_OWNER,
        default=False,
    )
    if not is_owner:
        return
    self.get_current_customer_context().set(None)
|
|
306
|
+
|
|
307
|
+
@dont_throw
def _add_agent_attributes_to_span(self, span):
    """Copy the current agent context onto ``span``, if one is set.

    Writes the agent id, class name, instance name, parent agent id and the
    entry-point flag as span attributes, then sets the context's
    ``is_agent_entry_point`` entry to ``False``.
    """
    agent_ctx = self.agent_context.get()
    if not agent_ctx:
        return

    # (span attribute key, agent-context field) pairs, written in this order.
    attribute_sources = (
        (AttributeKeys.JUDGMENT_AGENT_ID, "agent_id"),
        (AttributeKeys.JUDGMENT_AGENT_CLASS_NAME, "class_name"),
        (AttributeKeys.JUDGMENT_AGENT_INSTANCE_NAME, "instance_name"),
        (AttributeKeys.JUDGMENT_PARENT_AGENT_ID, "parent_agent_id"),
        (AttributeKeys.JUDGMENT_IS_AGENT_ENTRY_POINT, "is_agent_entry_point"),
    )
    for attribute_key, context_field in attribute_sources:
        set_span_attribute(span, attribute_key, agent_ctx[context_field])

    # Cleared after being written, so later spans read False from this dict.
    agent_ctx["is_agent_entry_point"] = False
|
|
338
|
+
|
|
339
|
+
@dont_throw
def _record_instance_state(self, record_point: Literal["before", "after"], span):
    """Write a serialized snapshot of the tracked instance's state to ``span``.

    Does nothing unless an agent context is set and its ``track_state`` entry
    is truthy. The snapshot covers the names in ``track_attributes``
    (missing attributes read as ``None``). When that entry is ``None`` it
    covers every item of the instance ``__dict__`` whose name does not start
    with an underscore. Names are translated through ``field_mappings``.

    Args:
        record_point: ``"before"`` stores under ``JUDGMENT_STATE_BEFORE``;
            any other value stores under ``JUDGMENT_STATE_AFTER``.
        span: Span receiving the attribute.
    """
    agent_ctx = self.agent_context.get()
    if not (agent_ctx and agent_ctx.get("track_state")):
        return

    instance = agent_ctx.get("instance")
    tracked_names = agent_ctx.get("track_attributes")
    renames = agent_ctx.get("field_mappings", {})

    if tracked_names is None:
        snapshot = {
            renames.get(key, key): value
            for key, value in instance.__dict__.items()
            if not key.startswith("_")
        }
    else:
        snapshot = {
            renames.get(attr_name, attr_name): getattr(instance, attr_name, None)
            for attr_name in tracked_names
        }

    if record_point == "before":
        state_key = AttributeKeys.JUDGMENT_STATE_BEFORE
    else:
        state_key = AttributeKeys.JUDGMENT_STATE_AFTER
    set_span_attribute(span, state_key, safe_serialize(snapshot))
|
|
368
|
+
|
|
369
|
+
@dont_throw
def _add_customer_id_to_span(self, span):
    """Write the customer id from the context variable to ``span``, if truthy."""
    current_customer_id = self.get_current_customer_context().get()
    if not current_customer_id:
        return
    set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, current_customer_id)
|
|
374
|
+
|
|
375
|
+
@dont_throw
def _inject_judgment_context(self, span):
    """Apply agent attributes, then the customer id, to ``span``."""
    injectors = (
        self._add_agent_attributes_to_span,
        self._add_customer_id_to_span,
    )
    for inject in injectors:
        inject(span)
|
|
379
|
+
|
|
380
|
+
def _set_pending_trace_eval(
|
|
381
|
+
self,
|
|
382
|
+
span: Span,
|
|
383
|
+
scorer_config: TraceScorerConfig,
|
|
384
|
+
args: Tuple[Any, ...],
|
|
385
|
+
kwargs: Dict[str, Any],
|
|
386
|
+
):
|
|
387
|
+
if not self.enable_evaluation:
|
|
388
|
+
return
|
|
389
|
+
|
|
390
|
+
scorer = scorer_config.scorer
|
|
391
|
+
run_condition = scorer_config.run_condition
|
|
392
|
+
sampling_rate = scorer_config.sampling_rate
|
|
393
|
+
|
|
394
|
+
if scorer is None:
|
|
395
|
+
judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
|
|
396
|
+
return
|
|
397
|
+
if not isinstance(scorer, (TraceAPIScorerConfig)):
|
|
398
|
+
judgeval_logger.error(
|
|
399
|
+
"Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
|
|
400
|
+
% type(scorer)
|
|
401
|
+
)
|
|
402
|
+
return
|
|
403
|
+
|
|
404
|
+
if run_condition is not None and not run_condition(*args, **kwargs):
|
|
405
|
+
return
|
|
406
|
+
|
|
407
|
+
if sampling_rate < 0 or sampling_rate > 1:
|
|
408
|
+
judgeval_logger.error(
|
|
409
|
+
"Sampling rate must be between 0 and 1, got %s, skipping evaluation."
|
|
410
|
+
% sampling_rate
|
|
411
|
+
)
|
|
412
|
+
return
|
|
413
|
+
|
|
414
|
+
percentage = random.uniform(0, 1)
|
|
415
|
+
if percentage > sampling_rate:
|
|
416
|
+
judgeval_logger.info(
|
|
417
|
+
"Sampling rate is %s, skipping evaluation." % sampling_rate
|
|
418
|
+
)
|
|
419
|
+
return
|
|
420
|
+
|
|
421
|
+
span_context = span.get_span_context()
|
|
422
|
+
if span_context == INVALID_SPAN_CONTEXT:
|
|
423
|
+
return
|
|
424
|
+
trace_id = format(span_context.trace_id, "032x")
|
|
425
|
+
span_id = format(span_context.span_id, "016x")
|
|
426
|
+
eval_run_name = f"async_trace_evaluate_{span_id}"
|
|
427
|
+
|
|
428
|
+
eval_run = TraceEvaluationRun(
|
|
429
|
+
project_name=self.project_name,
|
|
430
|
+
eval_name=eval_run_name,
|
|
431
|
+
scorers=[scorer],
|
|
432
|
+
trace_and_span_ids=[(trace_id, span_id)],
|
|
433
|
+
)
|
|
434
|
+
span.set_attribute(
|
|
435
|
+
AttributeKeys.PENDING_TRACE_EVAL,
|
|
436
|
+
safe_serialize(eval_run.model_dump(warnings=False)),
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
def _create_traced_sync_generator(
    self,
    generator: Generator,
    main_span: Span,
    disable_generator_yield_span: bool = False,
):
    """Wrap ``generator`` in a ``_ContextPreservedSyncGeneratorWrapper``.

    The wrapper receives this tracer, the generator, a copy of the current
    ``contextvars`` context taken at call time, ``main_span``, ``None`` and
    the ``disable_generator_yield_span`` flag, in that positional order.
    """
    context_snapshot = contextvars.copy_context()
    wrapped = _ContextPreservedSyncGeneratorWrapper(
        self,
        generator,
        context_snapshot,
        main_span,
        None,
        disable_generator_yield_span,
    )
    return wrapped
|
|
455
|
+
|
|
456
|
+
def _create_traced_async_generator(
    self,
    async_generator: AsyncGenerator,
    main_span: Span,
    disable_generator_yield_span: bool = False,
):
    """Wrap ``async_generator`` in a ``_ContextPreservedAsyncGeneratorWrapper``.

    The wrapper receives this tracer, the async generator, a copy of the
    current ``contextvars`` context taken at call time, ``main_span``,
    ``None`` and the ``disable_generator_yield_span`` flag, in that
    positional order.
    """
    context_snapshot = contextvars.copy_context()
    wrapped = _ContextPreservedAsyncGeneratorWrapper(
        self,
        async_generator,
        context_snapshot,
        main_span,
        None,
        disable_generator_yield_span,
    )
    return wrapped
|
|
472
|
+
|
|
473
|
+
def _wrap_sync(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
    disable_generator_yield_span: bool = False,
):
    """Return a traced replacement for the synchronous callable ``f``.

    Each call runs inside a span from ``sync_span_context`` and records the
    serialized inputs, the instance state before and after, and the output.
    A pending trace evaluation is also set when ``scorer_config`` is given.
    Exceptions are recorded on the span, the span status is set to ERROR,
    and the exception is re-raised.

    Args:
        f: Callable to wrap.
        name: Span name; ``f.__qualname__`` is used when falsy.
        attributes: Passed to ``sync_span_context``.
        scorer_config: Optional trace-scoring configuration.
        disable_generator_yield_span: Forwarded to the generator wrappers.

    Returns:
        The wrapping function, decorated with ``functools.wraps(f)``.
    """

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        span_name = name or f.__qualname__
        with sync_span_context(self, span_name, attributes) as span:
            # True once the span has been handed to a generator wrapper; the
            # ``finally`` below then leaves the span open.
            # NOTE(review): presumably the generator wrapper ends it - confirm.
            span_handed_off = False

            self._inject_judgment_context(span)
            self._record_instance_state("before", span)
            try:
                set_span_attribute(
                    span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                self.judgment_processor.emit_partial()

                if scorer_config:
                    self._set_pending_trace_eval(span, scorer_config, args, kwargs)

                result = f(*args, **kwargs)

                if inspect.isgenerator(result):
                    span_handed_off = True
                    set_span_attribute(
                        span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
                    )
                    self._record_instance_state("after", span)
                    return self._create_traced_sync_generator(
                        result, span, disable_generator_yield_span
                    )

                if inspect.isasyncgen(result):
                    span_handed_off = True
                    set_span_attribute(
                        span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
                    )
                    self._record_instance_state("after", span)
                    return self._create_traced_async_generator(
                        result, span, disable_generator_yield_span
                    )

                # Plain return value: record it and finish here.
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
                )
                self._record_instance_state("after", span)
                self._maybe_clear_customer_context(span)
                return result
            except Exception as exc:
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR, str(exc)))
                self._maybe_clear_customer_context(span)
                raise
            finally:
                if not span_handed_off:
                    span.end()

    return wrapper
|
|
538
|
+
|
|
539
|
+
def _wrap_async(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
    disable_generator_yield_span: bool = False,
):
    """Build an async wrapper that runs every call of ``f`` inside a span.

    Args:
        f: Coroutine function to wrap; it is awaited by the wrapper.
        name: Span name; falls back to ``f.__qualname__`` when falsy.
        attributes: Attributes handed to ``async_span_context``.
        scorer_config: When truthy, forwarded to
            ``self._set_pending_trace_eval`` before ``f`` is awaited.
        disable_generator_yield_span: Forwarded to the traced generator
            factories when the awaited result is a (sync or async) generator.

    Returns:
        The wrapping coroutine function, decorated with ``functools.wraps(f)``.
    """

    @functools.wraps(f)
    async def wrapper(*args, **kwargs):
        n = name or f.__qualname__
        async with async_span_context(self, n, attributes) as span:
            # Flipped only once a traced generator wrapper has actually been
            # created; until then this function is responsible for ending the
            # span, so a failure while building the wrapper cannot leak it.
            is_return_type_generator = False
            self._inject_judgment_context(span)
            self._record_instance_state("before", span)
            try:
                set_span_attribute(
                    span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                self.judgment_processor.emit_partial()

                if scorer_config:
                    self._set_pending_trace_eval(span, scorer_config, args, kwargs)

                result = await f(*args, **kwargs)
                if inspect.isasyncgen(result):
                    set_span_attribute(
                        span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
                    )
                    self._record_instance_state("after", span)
                    traced_async = self._create_traced_async_generator(
                        result, span, disable_generator_yield_span
                    )
                    is_return_type_generator = True
                    return traced_async
                elif inspect.isgenerator(result):
                    set_span_attribute(
                        span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
                    )
                    self._record_instance_state("after", span)
                    traced_sync = self._create_traced_sync_generator(
                        result, span, disable_generator_yield_span
                    )
                    is_return_type_generator = True
                    return traced_sync
                else:
                    set_span_attribute(
                        span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
                    )
                    self._record_instance_state("after", span)
                    self._maybe_clear_customer_context(span)
                    return result
            except Exception as user_exc:
                span.record_exception(user_exc)
                # Fall back to the exception type name so the status
                # description is never an empty string.
                span.set_status(
                    Status(
                        StatusCode.ERROR,
                        str(user_exc) or type(user_exc).__name__,
                    )
                )
                self._maybe_clear_customer_context(span)
                raise
            finally:
                # For generator results the span is left open here.
                # NOTE(review): presumably the traced generator wrapper ends
                # it later - confirm against _create_traced_*_generator.
                if not is_return_type_generator:
                    span.end()

    return wrapper
|
|
602
|
+
|
|
603
|
+
@overload
def observe(
    self,
    func: C,
    /,
    *,
    span_type: str | None = "span",
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
    disable_generator_yield_span: bool = False,
) -> C: ...

@overload
def observe(
    self,
    func: None = None,
    /,
    *,
    span_type: str | None = "span",
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
    disable_generator_yield_span: bool = False,
) -> Callable[[C], C]: ...

def observe(
    self,
    func: Callable | None = None,
    /,
    *,
    span_type: str | None = "span",
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
    disable_generator_yield_span: bool = False,
) -> Callable | None:
    """Decorate ``func`` so each call is recorded as a span.

    Usable both bare (``@tracer.observe``) and with options
    (``@tracer.observe(span_type="tool")``): when ``func`` is ``None`` a
    partial of this method carrying the keyword options is returned.

    Args:
        func: The callable to wrap, or ``None`` when used with options.
        span_type: Value stored under ``AttributeKeys.JUDGMENT_SPAN_KIND``.
        span_name: Span name; defaults to ``func.__qualname__`` (or
            ``"function"`` when the callable has no ``__qualname__``).
        attributes: Extra span attributes; merged after the span kind, so a
            matching key here overrides ``span_type``.
        scorer_config: Forwarded to the sync/async wrapper.
        disable_generator_yield_span: Forwarded to the sync/async wrapper.

    Returns:
        ``func`` unchanged when ``self.enable_monitoring`` is falsy, otherwise
        the wrapper built by ``_wrap_async`` (coroutine functions) or
        ``_wrap_sync`` (everything else).
    """
    if func is None:
        return partial(
            self.observe,
            span_type=span_type,
            span_name=span_name,
            attributes=attributes,
            scorer_config=scorer_config,
            disable_generator_yield_span=disable_generator_yield_span,
        )

    if not self.enable_monitoring:
        return func

    # Handle functions (including generator functions) - generators are
    # detected at call time by the wrappers, not here.
    name = span_name or getattr(func, "__qualname__", "function")
    func_attributes: Dict[str, Any] = {
        AttributeKeys.JUDGMENT_SPAN_KIND: span_type,
        **(attributes or {}),
    }

    if inspect.iscoroutinefunction(func):
        return self._wrap_async(
            func, name, func_attributes, scorer_config, disable_generator_yield_span
        )
    else:
        return self._wrap_sync(
            func, name, func_attributes, scorer_config, disable_generator_yield_span
        )
|
|
666
|
+
|
|
667
|
+
@overload
def agent(
    self,
    func: C,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    field_mappings: Optional[Dict[str, str]] = None,
) -> C: ...

@overload
def agent(
    self,
    func: None = None,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    field_mappings: Optional[Dict[str, str]] = None,
) -> Callable[[C], C]: ...

def agent(
    self,
    func: Callable | None = None,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    field_mappings: Optional[Dict[str, str]] = None,
) -> Callable | None:
    """
    Agent decorator that creates an agent ID and propagates it to child spans.
    Also captures and propagates the class name if the decorated function is a method.
    Optionally captures instance name based on the specified identifier attribute.

    This decorator should be used in combination with @observe decorator:

        class MyAgent:
            def __init__(self, name):
                self.name = name

            @judgment.agent(identifier="name")
            @judgment.observe(span_type="function")
            def my_agent_method(self):
                # This span and all child spans will have:
                # - agent_id: auto-generated UUID
                # - class_name: "MyAgent"
                # - instance_name: self.name value
                pass

    Args:
        func: The callable to wrap, or ``None`` when used with options; in
            that case a partial of this method is returned.
        identifier: Name of the instance attribute to use as the instance name
        track_state: Forwarded unchanged to the agent context manager.
        track_attributes: Forwarded unchanged to the agent context manager.
        field_mappings: Forwarded to the agent context manager; ``None``
            (the default) is passed on as a fresh empty dict so no dict
            object is shared between decorated functions.

    Returns:
        ``func`` unchanged when ``self.enable_monitoring`` is falsy, otherwise
        a sync or async wrapper that runs ``func`` inside the agent context.
    """
    if func is None:
        return partial(
            self.agent,
            identifier=identifier,
            track_state=track_state,
            track_attributes=track_attributes,
            field_mappings=field_mappings,
        )

    if not self.enable_monitoring:
        return func

    # A fresh dict per decorated function replaces the former shared ``{}``
    # default argument.
    if field_mappings is None:
        field_mappings = {}

    # Derive the owning class from the qualified name ("Cls.method").  A
    # function nested in another function has "outer.<locals>.inner", whose
    # penultimate part is not a class, so it is ignored.
    class_name = None
    parts = getattr(func, "__qualname__", "").split(".")
    if len(parts) >= 2 and parts[-2] != "<locals>":
        class_name = parts[-2]

    if inspect.iscoroutinefunction(func):

        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs):
            async with async_agent_context(
                tracer=self,
                args=args,
                class_name=class_name,
                identifier=identifier,
                track_state=track_state,
                track_attributes=track_attributes,
                field_mappings=field_mappings,
            ):
                return await func(*args, **kwargs)

        return async_wrapper
    else:

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            with sync_agent_context(
                tracer=self,
                args=args,
                class_name=class_name,
                identifier=identifier,
                track_state=track_state,
                track_attributes=track_attributes,
                field_mappings=field_mappings,
            ):
                return func(*args, **kwargs)

        return sync_wrapper
|
|
774
|
+
|
|
775
|
+
def wrap(self, client: ApiClient) -> ApiClient:
    """Hand ``client`` and this tracer to ``wrap_provider`` and return its result."""
    wrapped_client = wrap_provider(self, client)
    return wrapped_client
|
|
777
|
+
|
|
778
|
+
def force_flush(self, timeout_millis: int = 30000) -> bool:
    """Flush pending spans through ``self.judgment_processor``.

    Args:
        timeout_millis: Timeout in milliseconds, passed straight to the
            processor's ``force_flush``.

    Returns:
        Whatever the processor's ``force_flush`` returns, or ``False`` when
        it raises (the error is logged as a warning instead of propagating).
    """
    flushed = False
    try:
        flushed = self.judgment_processor.force_flush(timeout_millis)
    except Exception as e:
        judgeval_logger.warning(f"Error flushing processor: {e}")
    return flushed
|
|
792
|
+
|
|
793
|
+
def _atexit_flush(self, timeout_millis: int = 30000) -> None:
    """Best-effort flush intended to run at interpreter exit.

    Delegates to ``self.force_flush`` with the given timeout and never
    raises: any exception is logged as a warning so shutdown can continue.
    """
    try:
        self.force_flush(timeout_millis=timeout_millis)
    except Exception as e:
        # Shutdown must not be interrupted by a failed flush.
        judgeval_logger.warning(f"Error during atexit flush: {e}")
    return None
|
|
803
|
+
|
|
804
|
+
@dont_throw
def async_evaluate(
    self,
    /,
    *,
    scorer: Union[ExampleAPIScorerConfig, ExampleScorer, None],
    example: Example,
    sampling_rate: float = 1.0,
):
    """Queue a hosted evaluation of ``example`` tied to the current span.

    Every precondition failure is logged and the call returns ``None``
    without queueing anything.

    Args:
        scorer: An ``ExampleAPIScorerConfig`` or ``ExampleScorer``. ``None``
            is accepted but only logs an error.
        example: The ``Example`` to score.
        sampling_rate: Probability in ``[0, 1]`` that the evaluation is
            queued. ``0`` never queues, ``1`` always queues; values outside
            the range (including NaN) are rejected.
    """
    if not self.enable_evaluation or not self.enable_monitoring:
        judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
        return

    if scorer is None:
        judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
        return

    if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
        judgeval_logger.error(
            "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
            % type(scorer)
        )
        return

    if not isinstance(example, Example):
        judgeval_logger.error(
            "Example must be an instance of Example, got %s, skipping evaluation."
            % type(example)
        )
        return

    # Written as a chained comparison so NaN (for which both ``< 0`` and
    # ``> 1`` are False) is rejected as well.
    if not (0 <= sampling_rate <= 1):
        judgeval_logger.error(
            "Sampling rate must be between 0 and 1, got %s, skipping evaluation."
            % sampling_rate
        )
        return

    # random.random() is in [0.0, 1.0): with ``>=`` a rate of 0 never
    # evaluates and a rate of 1 always evaluates.
    if random.random() >= sampling_rate:
        judgeval_logger.info(
            "Sampling rate is %s, skipping evaluation." % sampling_rate
        )
        return

    span_context = self.get_current_span().get_span_context()
    if span_context == INVALID_SPAN_CONTEXT:
        judgeval_logger.warning(
            "No span context was found for async_evaluate, skipping evaluation. Please make sure to use the @observe decorator on the function you are evaluating."
        )
        return

    # Fixed-width lowercase hex: 32 digits for the trace id, 16 for the span id.
    trace_id = format(span_context.trace_id, "032x")
    span_id = format(span_context.span_id, "016x")
    hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
        isinstance(scorer, ExampleScorer) and scorer.server_hosted
    )
    eval_run = ExampleEvaluationRun(
        project_name=self.project_name,
        # note this name doesn't matter because we don't save the experiment, only the example and scorer_data
        eval_name=f"async_evaluate_{span_id}",
        examples=[example],
        scorers=[scorer],
        trace_span_id=span_id,
        trace_id=trace_id,
    )
    if hosted_scoring:
        self.api_client.add_to_run_eval_queue_examples(
            eval_run.model_dump(warnings=False)  # type: ignore
        )
    else:
        judgeval_logger.warning(
            "The scorer provided is not hosted, skipping evaluation."
        )
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
def wrap(client: ApiClient) -> ApiClient:
    """Wrap ``client`` with the singleton tracer, or return it untouched.

    The client is returned unwrapped, with a ``JudgmentWarning``, when no
    ``Tracer`` instance exists, when the tracer is not initialized, or when
    anything raises along the way.
    """
    try:
        tracer = Tracer.get_instance()

        # isinstance() is already False for None, so one test covers both
        # "no instance" and "wrong type".
        if not isinstance(tracer, Tracer):
            warn(
                "No Tracer instance found, client will not be wrapped. "
                "Create a Tracer instance first.",
                JudgmentWarning,
                stacklevel=2,
            )
            return client

        if tracer._initialized:
            return tracer.wrap(client)

        warn(
            "Tracer not initialized, client will not be wrapped. "
            "Call Tracer.initialize() first to setup the tracer.",
            JudgmentWarning,
            stacklevel=2,
        )
        return client
    except Exception:
        warn(
            "Error accessing tracer singleton, client will not be wrapped.",
            JudgmentWarning,
            stacklevel=2,
        )
        return client
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def format_inputs(
    f: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    """Map call arguments onto the parameter names of ``f``.

    Binding is best-effort rather than strict: parameters that received no
    argument are simply left out, and surplus arguments are ignored, so the
    result can be recorded even for a call that would itself fail.

    Args:
        f: The callable whose signature names the inputs.
        args: Positional arguments of the call.
        kwargs: Keyword arguments of the call; never mutated.

    Returns:
        ``{parameter name: value}`` in signature order. A ``*args`` parameter
        maps to the tuple of leftover positionals and a ``**kwargs`` parameter
        to the keywords not bound to a named parameter. Returns ``{}`` when
        the signature cannot be inspected.
    """
    try:
        inputs: Dict[str, Any] = {}
        # Work on a copy: bound keywords are popped so that only the
        # leftovers end up under the **kwargs parameter.
        unbound = dict(kwargs)
        arg_i = 0
        for param in inspect.signature(f).parameters.values():
            kind = param.kind
            if kind in (
                inspect.Parameter.POSITIONAL_ONLY,
                inspect.Parameter.POSITIONAL_OR_KEYWORD,
            ):
                if arg_i < len(args):
                    inputs[param.name] = args[arg_i]
                    arg_i += 1
                elif (
                    kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
                    and param.name in unbound
                ):
                    # Positional-only parameters cannot be passed by keyword;
                    # a same-named keyword belongs to **kwargs instead.
                    inputs[param.name] = unbound.pop(param.name)
            elif kind == inspect.Parameter.VAR_POSITIONAL:
                inputs[param.name] = args[arg_i:]
                arg_i = len(args)
            elif kind == inspect.Parameter.KEYWORD_ONLY:
                if param.name in unbound:
                    inputs[param.name] = unbound.pop(param.name)
            elif kind == inspect.Parameter.VAR_KEYWORD:
                # Always the last parameter, so every named one is bound by now.
                inputs[param.name] = unbound
        return inputs
    except Exception:
        # Input capture is telemetry only; it must never break the call.
        return {}
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
class _ContextPreservedSyncGeneratorWrapper:
    """Sync generator wrapper that ensures each iteration runs in preserved context.

    Each ``next()`` on the wrapped generator is executed through
    ``context.run`` and, unless disabled, every yielded item is recorded on
    its own child span. The parent ``span`` is ended exactly once: on
    exhaustion, on the first error, or on ``close()``.
    """

    def __init__(
        self,
        tracer: Tracer,
        generator: Generator,
        context: contextvars.Context,
        span: Span,
        transform_fn: Optional[Callable[[Iterable], str]],
        disable_generator_yield_span: bool = False,
    ) -> None:
        self.tracer = tracer
        self.generator = generator
        self.context = context
        self.span = span
        # Stored but not read anywhere in this class.
        self.transform_fn = transform_fn
        # True once the parent span has been ended by this wrapper.
        self._finished = False
        self.disable_generator_yield_span = disable_generator_yield_span

    def __iter__(self) -> "_ContextPreservedSyncGeneratorWrapper":
        return self

    def _finish(self, error: Optional[Exception] = None) -> None:
        """End the parent span once, recording ``error`` when one is given."""
        if self._finished:
            return
        if error is None:
            set_span_attribute(
                self.span, AttributeKeys.JUDGMENT_SPAN_KIND, "generator"
            )
        else:
            self.span.record_exception(error)
            self.span.set_status(
                Status(StatusCode.ERROR, str(error) or type(error).__name__)
            )
        # Called on every termination path, including normal exhaustion, so
        # all three ways of finishing behave the same.
        self.tracer._maybe_clear_customer_context(self.span)
        self.span.end()
        self._finished = True

    def __next__(self) -> Any:
        try:
            # Run the generator's __next__ in the preserved context
            item = self.context.run(next, self.generator)

            if not self.disable_generator_yield_span:
                with use_span(self.span):
                    span_name = (
                        str(self.span.name)
                        if hasattr(self.span, "name")
                        else "generator_item"
                    )  # type: ignore[attr-defined]
                    with self.tracer.get_tracer().start_as_current_span(
                        span_name,
                        attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "generator_item"},
                        end_on_exit=True,
                    ) as child_span:
                        set_span_attribute(
                            child_span,
                            AttributeKeys.JUDGMENT_OUTPUT,
                            safe_serialize(item),
                        )

            return item

        except StopIteration:
            # Generator exhausted: close out the parent span, then let the
            # StopIteration propagate to the consumer.
            self._finish()
            raise

        except Exception as e:
            self._finish(e)
            raise

    def close(self) -> None:
        """Close the generator (minimal implementation)."""
        try:
            self.generator.close()
        finally:
            # End the span even when the generator's close() raises.
            self._finish()
|
|
1016
|
+
|
|
1017
|
+
|
|
1018
|
+
class _ContextPreservedAsyncGeneratorWrapper:
    """Async generator wrapper that ensures each iteration runs in preserved context.

    Each ``__anext__`` step of the wrapped generator is driven inside the
    preserved context where the running Python supports it and, unless
    disabled, every yielded item is recorded on its own child span. The
    parent ``span`` is ended exactly once: on exhaustion, on the first error,
    or on ``aclose()``.
    """

    def __init__(
        self,
        tracer: Tracer,
        generator: AsyncGenerator,
        context: contextvars.Context,
        span: Span,
        transform_fn: Optional[Callable[[Iterable], str]],
        disable_generator_yield_span: bool = False,
    ) -> None:
        self.tracer = tracer
        self.generator = generator
        self.context = context
        self.span = span
        # Stored but not read anywhere in this class.
        self.transform_fn = transform_fn
        # True once the parent span has been ended by this wrapper.
        self._finished = False
        self.disable_generator_yield_span = disable_generator_yield_span

    def __aiter__(self) -> "_ContextPreservedAsyncGeneratorWrapper":
        return self

    def _finish(self, error: Optional[Exception] = None) -> None:
        """End the parent span once, recording ``error`` when one is given."""
        if self._finished:
            return
        if error is None:
            set_span_attribute(
                self.span, AttributeKeys.JUDGMENT_SPAN_KIND, "generator"
            )
        else:
            self.span.record_exception(error)
            self.span.set_status(
                Status(StatusCode.ERROR, str(error) or type(error).__name__)
            )
        # Called on every termination path, including normal exhaustion, so
        # all three ways of finishing behave the same.
        self.tracer._maybe_clear_customer_context(self.span)
        self.span.end()
        self._finished = True

    async def __anext__(self) -> Any:
        try:
            # Run the generator's __anext__ in the preserved context.
            step = self.generator.__anext__()  # type: ignore
            try:
                # The ``context`` keyword of create_task exists from
                # Python 3.11 on.  Only task *creation* is guarded: a
                # TypeError raised by the generator body itself must
                # propagate instead of triggering the fallback.
                task = asyncio.create_task(step, context=self.context)  # type: ignore
            except TypeError:
                # Older Python: no ``context`` keyword; drive the same step
                # directly in the current context.
                item = await step
            else:
                item = await task

            if not self.disable_generator_yield_span:
                with use_span(self.span):
                    span_name = (
                        str(self.span.name)
                        if hasattr(self.span, "name")
                        else "generator_item"
                    )  # type: ignore[attr-defined]
                    with self.tracer.get_tracer().start_as_current_span(
                        span_name,
                        attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "generator_item"},
                        end_on_exit=True,
                    ) as child_span:
                        set_span_attribute(
                            child_span,
                            AttributeKeys.JUDGMENT_OUTPUT,
                            safe_serialize(item),
                        )

            return item

        except StopAsyncIteration:
            # Generator exhausted: close out the parent span, then let the
            # StopAsyncIteration propagate to the consumer.
            self._finish()
            raise
        except Exception as e:
            self._finish(e)
            raise

    async def aclose(self) -> None:
        """Close the async generator (minimal implementation)."""
        try:
            await self.generator.aclose()
        finally:
            # End the span even when the generator's aclose() raises.
            self._finish()
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
# Public API of this module.
__all__ = ["Tracer", "wrap"]
|