judgeval 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/__init__.py +28 -96
- judgeval/api/api_types.py +49 -140
- judgeval/constants.py +1 -5
- judgeval/data/__init__.py +1 -3
- judgeval/data/example.py +4 -2
- judgeval/data/judgment_types.py +57 -165
- judgeval/data/result.py +1 -2
- judgeval/data/trace.py +14 -40
- judgeval/dataset/__init__.py +15 -42
- judgeval/evaluation/__init__.py +23 -34
- judgeval/scorers/__init__.py +9 -7
- judgeval/scorers/api_scorer.py +8 -0
- judgeval/scorers/base_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- judgeval/tracer/__init__.py +40 -93
- judgeval/tracer/local_eval_queue.py +2 -2
- judgeval/tracer/processors/__init__.py +84 -6
- judgeval/tracer/utils.py +1 -1
- judgeval/trainer/trainer.py +4 -4
- judgeval/utils/serialize.py +7 -1
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/METADATA +2 -2
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/RECORD +31 -36
- judgeval/data/trace_run.py +0 -39
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -10,24 +10,16 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
     InstructionAdherenceScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
-    DerailmentScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
+    TracePromptScorer,
     PromptScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
-    ToolDependencyScorer,
-)
 
 __all__ = [
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "InstructionAdherenceScorer",
-    "DerailmentScorer",
-    "ToolOrderScorer",
+    "TracePromptScorer",
    "PromptScorer",
-    "ToolDependencyScorer",
 ]
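
In 0.10.0 the derailment, tool-order, and tool-dependency scorers disappear from the public exports (their modules are deleted outright, per the file list above), and TracePromptScorer is added. A minimal import-migration sketch, using only the paths visible in this hunk:

    # judgeval 0.9.3 exports that no longer exist in 0.10.0:
    # from judgeval.scorers.judgeval_scorers.api_scorers import (
    #     DerailmentScorer, ToolOrderScorer, ToolDependencyScorer,
    # )

    # judgeval 0.10.0:
    from judgeval.scorers.judgeval_scorers.api_scorers import (
        PromptScorer,
        TracePromptScorer,  # new trace-level prompt scorer
    )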
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(APIScorerConfig):
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(APIScorerConfig):
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(APIScorerConfig):
+class FaithfulnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIScorerConfig):
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
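
The four hunks above are the same mechanical change: every built-in example-level scorer now extends ExampleAPIScorerConfig instead of the old APIScorerConfig base. A hedged sketch of the equivalent update for a downstream subclass; the class name is hypothetical and the field shapes are assumed to match the built-ins shown above:

    from typing import List

    from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
    from judgeval.constants import APIScorerType
    from judgeval.data import ExampleParams

    # Previously: class MyScorer(APIScorerConfig): ...
    class MyScorer(ExampleAPIScorerConfig):  # hypothetical subclass
        score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY  # real member reused for illustration
        required_params: List[ExampleParams] = [ExampleParams.INPUT]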
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -1,4 +1,8 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.constants import APIScorerType
 from typing import Dict, Any, Optional
 from judgeval.api import JudgmentSyncClient
@@ -6,6 +10,7 @@ from judgeval.exceptions import JudgmentAPIError
 import os
 from copy import copy
 from judgeval.logger import judgeval_logger
+from abc import ABC
 
 
 def push_prompt_scorer(
@@ -15,6 +20,7 @@ def push_prompt_scorer(
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    is_trace: Optional[bool] = None,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -24,6 +30,7 @@ def push_prompt_scorer(
                 "prompt": prompt,
                 "threshold": threshold,
                 "options": options,
+                "is_trace": is_trace,
             }
         )
     except JudgmentAPIError as e:
@@ -88,7 +95,7 @@ def scorer_exists(
     )
 
 
-class PromptScorer(APIScorerConfig):
+class BasePromptScorer(ABC, APIScorerConfig):
     """
     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
    1. a system role that may involve the Example object
@@ -97,9 +104,9 @@ class PromptScorer(APIScorerConfig):
     and uses a judge to execute the evaluation from the system role and classify into one of the options
     """
 
+    score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
-    score_type: APIScorerType = APIScorerType.PROMPT_SCORER
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -111,7 +118,18 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} is not a {cls.__name__}",
+                response=None,  # type: ignore
+            )
+        if issubclass(cls, TracePromptScorer):
+            score_type = APIScorerType.TRACE_PROMPT_SCORER
+        else:
+            score_type = APIScorerType.PROMPT_SCORER
         return cls(
+            score_type=score_type,
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
@@ -131,11 +149,24 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
+            if issubclass(cls, TracePromptScorer):
+                is_trace = True
+                score_type = APIScorerType.TRACE_PROMPT_SCORER
+            else:
+                is_trace = False
+                score_type = APIScorerType.PROMPT_SCORER
             push_prompt_scorer(
-                name, prompt, threshold, options, judgment_api_key, organization_id
+                name,
+                prompt,
+                threshold,
+                options,
+                judgment_api_key,
+                organization_id,
+                is_trace,
             )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
             return cls(
+                score_type=score_type,
                 name=name,
                 prompt=prompt,
                 threshold=threshold,
@@ -251,3 +282,11 @@ class PromptScorer(APIScorerConfig):
             k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
         return base
+
+
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+    pass
+
+
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+    pass
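
The net effect of these hunks: the old concrete PromptScorer(APIScorerConfig) becomes an abstract BasePromptScorer, with PromptScorer (example-level) and TracePromptScorer (trace-level) as thin concrete subclasses, and is_trace is persisted server-side so fetching a scorer through the wrong class fails with a 400. A hedged usage sketch; the classmethod names get and create are assumptions, since the hunks show only their bodies:

    from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
        PromptScorer,
        TracePromptScorer,
    )

    # Assumed classmethod name; creates the scorer server-side with is_trace=False.
    scorer = PromptScorer.create(name="helpfulness", prompt="...", threshold=0.5)

    # Fetching an example-level scorer through TracePromptScorer now raises
    # JudgmentAPIError(400) because scorer_config["is_trace"] won't match the class.
    trace_scorer = TracePromptScorer.get(name="my_trace_scorer")  # assumed name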
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,7 @@ from judgeval.env import (
     JUDGMENT_ORG_ID,
 )
 from judgeval.logger import judgeval_logger
-from judgeval.scorers.api_scorer import APIScorerConfig
-from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
 from judgeval.tracer.managers import (
@@ -57,7 +56,7 @@ from judgeval.utils.serialize import safe_serialize
 from judgeval.version import get_version
 from judgeval.warnings import JudgmentWarning
 
-from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
+from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
 from judgeval.api import JudgmentSyncClient
 from judgeval.tracer.llm import wrap_provider
 from judgeval.utils.url import url_for
@@ -65,6 +64,7 @@ from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
 from judgeval.tracer.processors import (
     JudgmentSpanProcessor,
     NoOpJudgmentSpanProcessor,
+    NoOpSpanProcessor,
 )
 from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
 
@@ -85,19 +85,6 @@ class AgentContext(TypedDict):
     parent_agent_id: str | None
 
 
-def resolve_project_id(
-    api_key: str, organization_id: str, project_name: str
-) -> str | None:
-    try:
-        client = JudgmentSyncClient(
-            api_key=api_key,
-            organization_id=organization_id,
-        )
-        return client.projects_resolve({"project_name": project_name})["project_id"]
-    except Exception:
-        return None
-
-
 class Tracer:
     _active_tracers: List[Tracer] = []
 
@@ -188,38 +175,20 @@ class Tracer:
         self.cost_context = ContextVar("current_cost_context", default=None)
 
         if self.enable_monitoring:
-            project_id = resolve_project_id(
-                self.api_key, self.organization_id, self.project_name
-            )
-
-            resource_attributes = resource_attributes or {}
-            resource_attributes.update(
-                {
-                    ResourceKeys.SERVICE_NAME: self.project_name,
-                    ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
-                    ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
-                }
-            )
-
-            if project_id is not None:
-                resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = project_id
-            else:
-                judgeval_logger.error(
-                    f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
-                )
-
-            resource = Resource.create(resource_attributes)
-
             self.judgment_processor = JudgmentSpanProcessor(
                 self,
-                self.
+                self.project_name,
                 self.api_key,
                 self.organization_id,
                 max_queue_size=2**18,
                 export_timeout_millis=30000,
+                resource_attributes=resource_attributes,
             )
-
+
+            resource = Resource.create(self.judgment_processor.resource_attributes)
             self.provider = TracerProvider(resource=resource)
+
+            self.processors.append(self.judgment_processor)
             for processor in self.processors:
                 self.provider.add_span_processor(processor)
 
@@ -253,6 +222,14 @@ class Tracer:
     def get_current_cost_context(self):
         return self.cost_context
 
+    def get_processor(self):
+        """Get the judgment span processor instance.
+
+        Returns:
+            The JudgmentSpanProcessor or NoOpJudgmentSpanProcessor instance used by this tracer.
+        """
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
@@ -507,11 +484,11 @@
                 safe_serialize(format_inputs(f, args, kwargs)),
             )
 
+            self.judgment_processor.emit_partial()
+
             if scorer_config:
                 self._set_pending_trace_eval(span, scorer_config, args, kwargs)
 
-            self.judgment_processor.emit_partial()
-
             result = f(*args, **kwargs)
         except Exception as user_exc:
             span.record_exception(user_exc)
@@ -559,13 +536,13 @@
                 safe_serialize(format_inputs(f, args, kwargs)),
             )
 
+            self.judgment_processor.emit_partial()
+
             if scorer_config:
                 self._set_pending_trace_eval(
                     main_span, scorer_config, args, kwargs
                 )
 
-            self.judgment_processor.emit_partial()
-
             generator = f(*args, **kwargs)
             set_span_attribute(
                 main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
@@ -609,11 +586,11 @@
                 safe_serialize(format_inputs(f, args, kwargs)),
             )
 
+            self.judgment_processor.emit_partial()
+
             if scorer_config:
                 self._set_pending_trace_eval(span, scorer_config, args, kwargs)
 
-            self.judgment_processor.emit_partial()
-
             result = await f(*args, **kwargs)
         except Exception as user_exc:
             span.record_exception(user_exc)
@@ -661,13 +638,13 @@
                 safe_serialize(format_inputs(f, args, kwargs)),
             )
 
+            self.judgment_processor.emit_partial()
+
             if scorer_config:
                 self._set_pending_trace_eval(
                     main_span, scorer_config, args, kwargs
                 )
 
-            self.judgment_processor.emit_partial()
-
             async_generator = f(*args, **kwargs)
             set_span_attribute(
                 main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
@@ -847,42 +824,6 @@ class Tracer:
 
         return sync_wrapper
 
-    @overload
-    def observe_tools(
-        self,
-        cls: Cls,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Cls: ...
-
-    @overload
-    def observe_tools(
-        self,
-        cls: None = None,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Callable[[Cls], Cls]: ...
-
-    def observe_tools(
-        self,
-        cls: Cls | None = None,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Cls | Callable[[Cls], Cls]:
-        if cls is None:
-            return partial(
-                self.observe_tools,
-                exclude_methods=exclude_methods,
-                include_private=include_private,
-            )
-        return cls
-
     def wrap(self, client: ApiClient) -> ApiClient:
         return wrap_provider(self, client)
 
@@ -913,11 +854,7 @@
         proper cleanup before program termination.
         """
         try:
-
-            if not success:
-                judgeval_logger.warning(
-                    "Some spans may not have been exported before program exit"
-                )
+            self.force_flush(timeout_millis=30000)
         except Exception as e:
             judgeval_logger.warning(f"Error during atexit flush: {e}")
 
@@ -925,7 +862,7 @@
         self,
         /,
         *,
-        scorer: Union[APIScorerConfig, BaseScorer],
+        scorer: Union[ExampleAPIScorerConfig, BaseScorer],
         example: Example,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         sampling_rate: float = 1.0,
@@ -934,9 +871,9 @@
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
 
-        if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+        if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
             judgeval_logger.error(
-                "Scorer must be an instance of APIScorerConfig or BaseScorer, got %s, skipping evaluation."
+                "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
                 % type(scorer)
             )
             return
@@ -965,7 +902,7 @@
         span_context = self.get_current_span().get_span_context()
         trace_id = format(span_context.trace_id, "032x")
         span_id = format(span_context.span_id, "016x")
-        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+        hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
             isinstance(scorer, BaseScorer) and scorer.server_hosted
         )
         eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
@@ -1074,3 +1011,13 @@ def format_inputs(
         return inputs
     except Exception:
         return {}
+
+
+# Export processor classes for direct access
+__all__ = [
+    "Tracer",
+    "wrap",
+    "JudgmentSpanProcessor",
+    "NoOpJudgmentSpanProcessor",
+    "NoOpSpanProcessor",
+]
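
Two behavioral changes stand out above: emit_partial() now runs before the pending trace eval is registered (in all four wrappers), and project resolution plus resource-attribute assembly move out of Tracer.__init__ into JudgmentSpanProcessor, which the new get_processor() accessor exposes. A minimal sketch, assuming Tracer accepts the project_name keyword used throughout this file:

    from judgeval.tracer import Tracer

    tracer = Tracer(project_name="my_project")  # assumed constructor kwarg

    # New in 0.10.0: the processor (JudgmentSpanProcessor or the no-op variant)
    # is reachable directly, e.g. to flush partial spans by hand.
    processor = tracer.get_processor()
    processor.emit_partial()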
judgeval/tracer/local_eval_queue.py
CHANGED
@@ -24,7 +24,7 @@ class LocalEvaluationQueue:
     """Lightweight in-memory queue for local evaluation runs.
 
     Only supports EvaluationRuns with local scorers (BaseScorer instances).
-    API scorers (APIScorerConfig) are not supported as they have their own queue.
+    API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
     """
 
     def __init__(
@@ -54,7 +54,7 @@ class LocalEvaluationQueue:
         if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
-                "Found only APIScorerConfig instances."
+                "Found only ExampleAPIScorerConfig instances."
             )
 
         return safe_run_async(
judgeval/tracer/processors/__init__.py
CHANGED
@@ -6,8 +6,13 @@ from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, SpanContext
 from opentelemetry.sdk.trace.export import (
     BatchSpanProcessor,
 )
+from opentelemetry.sdk.resources import Resource
 from judgeval.tracer.exporters import JudgmentSpanExporter
-from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
+from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
+from judgeval.api import JudgmentSyncClient
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+from judgeval.version import get_version
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -31,15 +36,27 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
     def __init__(
         self,
         tracer: Tracer,
-        endpoint: str,
+        project_name: str,
         api_key: str,
         organization_id: str,
         /,
         *,
         max_queue_size: int = 2**18,
         export_timeout_millis: int = 30000,
+        resource_attributes: Optional[dict[str, Any]] = None,
     ):
         self.tracer = tracer
+        self.project_name = project_name
+        self.api_key = api_key
+        self.organization_id = organization_id
+
+        # Resolve project_id
+        self.project_id = self._resolve_project_id()
+
+        # Set up resource attributes with project_id
+        self._setup_resource_attributes(resource_attributes or {})
+
+        endpoint = url_for("/otel/v1/traces")
         super().__init__(
             JudgmentSpanExporter(
                 endpoint=endpoint,
@@ -53,6 +70,38 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
             defaultdict(dict)
         )
 
+    def _resolve_project_id(self) -> str | None:
+        """Resolve project_id from project_name using the API."""
+        try:
+            client = JudgmentSyncClient(
+                api_key=self.api_key,
+                organization_id=self.organization_id,
+            )
+            return client.projects_resolve({"project_name": self.project_name})[
+                "project_id"
+            ]
+        except Exception:
+            return None
+
+    def _setup_resource_attributes(self, resource_attributes: dict[str, Any]) -> None:
+        """Set up resource attributes including project_id."""
+        resource_attributes.update(
+            {
+                ResourceKeys.SERVICE_NAME: self.project_name,
+                ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+                ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+            }
+        )
+
+        if self.project_id is not None:
+            resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
+        else:
+            judgeval_logger.error(
+                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
+            )
+
+        self.resource_attributes = resource_attributes
+
     def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
         return (span_context.trace_id, span_context.span_id)
 
@@ -103,11 +152,18 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
         attributes = dict(current_span.attributes or {})
         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = current_update_id
+
+        existing_resource_attrs = (
+            dict(current_span.resource.attributes) if current_span.resource else {}
+        )
+        merged_resource_attrs = {**existing_resource_attrs, **self.resource_attributes}
+        merged_resource = Resource.create(merged_resource_attrs)
+
         partial_span = ReadableSpan(
             name=current_span.name,
             context=span_context,
             parent=current_span.parent,
-            resource=current_span.resource,
+            resource=merged_resource,
             attributes=attributes,
             events=current_span.events,
             links=current_span.links,
@@ -137,11 +193,20 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
         attributes = dict(span.attributes or {})
         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20
 
+        existing_resource_attrs = (
+            dict(span.resource.attributes) if span.resource else {}
+        )
+        merged_resource_attrs = {
+            **existing_resource_attrs,
+            **self.resource_attributes,
+        }
+        merged_resource = Resource.create(merged_resource_attrs)
+
         final_span = ReadableSpan(
             name=span.name,
             context=span.context,
             parent=span.parent,
-            resource=span.resource,
+            resource=merged_resource,
             attributes=attributes,
             events=span.events,
             links=span.links,
@@ -160,7 +225,7 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
 class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
     def __init__(self):
-
+        pass
 
     def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
         pass
@@ -177,5 +242,18 @@ class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
     def emit_partial(self) -> None:
         pass
 
+    def set_internal_attribute(
+        self, span_context: SpanContext, key: str, value: Any
+    ) -> None:
+        pass
+
+    def get_internal_attribute(
+        self, span_context: SpanContext, key: str, default: Any = None
+    ) -> Any:
+        return default
+
+    def increment_update_id(self, span_context: SpanContext) -> int:
+        return 0
+
 
-__all__ = ["JudgmentSpanProcessor", "NoOpJudgmentSpanProcessor"]
+__all__ = ["NoOpSpanProcessor", "JudgmentSpanProcessor", "NoOpJudgmentSpanProcessor"]
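
With resolution moved here, JudgmentSpanProcessor's second positional argument is now the project name (the OTLP endpoint is derived via url_for instead), and caller-supplied resource attributes are merged with judgeval's defaults and stamped onto both partial and final spans via Resource.create. A hedged construction sketch; the tracer instance and the extra attribute are assumptions:

    import os

    from judgeval.tracer.processors import JudgmentSpanProcessor

    processor = JudgmentSpanProcessor(
        tracer,  # an existing Tracer instance (assumed)
        "my_project",  # project_name
        os.environ["JUDGMENT_API_KEY"],
        os.environ["JUDGMENT_ORG_ID"],
        resource_attributes={"deployment.environment": "staging"},
    )
    # processor.resource_attributes now carries service.name, the telemetry SDK
    # name/version, and the resolved judgment project id (when resolution succeeds).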
judgeval/tracer/utils.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any
 from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
-from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
+from judgeval.scorers.api_scorer import TraceAPIScorerConfig
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
judgeval/trainer/trainer.py
CHANGED
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
 from judgeval.tracer.exporters import InMemorySpanExporter
 from judgeval.tracer.keys import AttributeKeys
 from judgeval import JudgmentClient
-from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data import Example
 from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError
@@ -154,7 +154,7 @@ class JudgmentTrainer:
     async def generate_rollouts_and_rewards(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
         num_prompts_per_step: Optional[int] = None,
         num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +264,7 @@ class JudgmentTrainer:
     async def run_reinforcement_learning(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
     ) -> ModelConfig:
         """
@@ -370,7 +370,7 @@ class JudgmentTrainer:
     async def train(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
         rft_provider: Optional[str] = None,
     ) -> ModelConfig:
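
The trainer's three public entry points change only their scorer list type: ExampleAPIScorerConfig replaces APIScorerConfig, matching the top-level export change in judgeval/scorers. A hedged call sketch; the trainer setup is elided and the threshold kwarg is assumed from the pydantic-style configs above:

    from typing import Any, List, Union

    from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
    from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
        AnswerRelevancyScorer,
    )

    scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]] = [
        AnswerRelevancyScorer(threshold=0.7),  # threshold kwarg assumed
    ]
    # model_config = await trainer.train(agent_function, scorers, prompts)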