judgeval 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/__init__.py +28 -96
  3. judgeval/api/api_types.py +49 -140
  4. judgeval/constants.py +1 -5
  5. judgeval/data/__init__.py +1 -3
  6. judgeval/data/example.py +4 -2
  7. judgeval/data/judgment_types.py +57 -165
  8. judgeval/data/result.py +1 -2
  9. judgeval/data/trace.py +14 -40
  10. judgeval/dataset/__init__.py +15 -42
  11. judgeval/evaluation/__init__.py +23 -34
  12. judgeval/scorers/__init__.py +9 -7
  13. judgeval/scorers/api_scorer.py +8 -0
  14. judgeval/scorers/base_scorer.py +0 -1
  15. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
  21. judgeval/tracer/__init__.py +40 -93
  22. judgeval/tracer/local_eval_queue.py +2 -2
  23. judgeval/tracer/processors/__init__.py +84 -6
  24. judgeval/tracer/utils.py +1 -1
  25. judgeval/trainer/trainer.py +4 -4
  26. judgeval/utils/serialize.py +7 -1
  27. {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/METADATA +2 -2
  28. {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/RECORD +31 -36
  29. judgeval/data/trace_run.py +0 -39
  30. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  33. judgeval/scorers/trace_api_scorer.py +0 -5
  34. {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
  35. {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
  36. {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/base_scorer.py CHANGED
@@ -85,7 +85,6 @@ class BaseScorer(BaseModel):
         This method is used at eval time
         """
         self.model_client, self.using_native_model = create_judge(model)
-        self.model = self.model_client.get_model_name() or model
 
     def success_check(self) -> bool:
         """
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -10,24 +10,16 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
     InstructionAdherenceScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
-    DerailmentScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
+    TracePromptScorer,
     PromptScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
-    ToolDependencyScorer,
-)
 
 __all__ = [
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "InstructionAdherenceScorer",
-    "DerailmentScorer",
-    "ToolOrderScorer",
+    "TracePromptScorer",
     "PromptScorer",
-    "ToolDependencyScorer",
 ]
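The 0.10.0 export surface above drops the derailment and tool scorers and adds TracePromptScorer next to PromptScorer. A minimal import sketch against these exports (the threshold value is illustrative, and it assumes the scorer configs accept threshold as a keyword like the other pydantic-based configs in this package):

from judgeval.scorers.judgeval_scorers.api_scorers import (
    FaithfulnessScorer,
    PromptScorer,
    TracePromptScorer,
)

# FaithfulnessScorer and friends now subclass ExampleAPIScorerConfig (example-level);
# TracePromptScorer is the new trace-level prompt scorer variant.
faithfulness = FaithfulnessScorer(threshold=0.8)  # illustrative threshold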
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(APIScorerConfig):
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py CHANGED
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(APIScorerConfig):
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(APIScorerConfig):
+class FaithfulnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIScorerConfig):
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -1,4 +1,8 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.constants import APIScorerType
 from typing import Dict, Any, Optional
 from judgeval.api import JudgmentSyncClient
@@ -6,6 +10,7 @@ from judgeval.exceptions import JudgmentAPIError
 import os
 from copy import copy
 from judgeval.logger import judgeval_logger
+from abc import ABC
 
 
 def push_prompt_scorer(
@@ -15,6 +20,7 @@ def push_prompt_scorer(
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    is_trace: Optional[bool] = None,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -24,6 +30,7 @@ def push_prompt_scorer(
                 "prompt": prompt,
                 "threshold": threshold,
                 "options": options,
+                "is_trace": is_trace,
             }
         )
     except JudgmentAPIError as e:
@@ -88,7 +95,7 @@ def scorer_exists(
     )
 
 
-class PromptScorer(APIScorerConfig):
+class BasePromptScorer(ABC, APIScorerConfig):
     """
     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
     1. a system role that may involve the Example object
@@ -97,9 +104,9 @@ class PromptScorer(APIScorerConfig):
     and uses a judge to execute the evaluation from the system role and classify into one of the options
     """
 
+    score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
-    score_type: APIScorerType = APIScorerType.PROMPT_SCORER
    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -111,7 +118,18 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} is not a {cls.__name__}",
+                response=None,  # type: ignore
+            )
+        if issubclass(cls, TracePromptScorer):
+            score_type = APIScorerType.TRACE_PROMPT_SCORER
+        else:
+            score_type = APIScorerType.PROMPT_SCORER
         return cls(
+            score_type=score_type,
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
@@ -131,11 +149,24 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
+            if issubclass(cls, TracePromptScorer):
+                is_trace = True
+                score_type = APIScorerType.TRACE_PROMPT_SCORER
+            else:
+                is_trace = False
+                score_type = APIScorerType.PROMPT_SCORER
             push_prompt_scorer(
-                name, prompt, threshold, options, judgment_api_key, organization_id
+                name,
+                prompt,
+                threshold,
+                options,
+                judgment_api_key,
+                organization_id,
+                is_trace,
             )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
+            score_type=score_type,
             name=name,
             prompt=prompt,
             threshold=threshold,
@@ -251,3 +282,11 @@ class PromptScorer(APIScorerConfig):
             k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
         return base
+
+
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+    pass
+
+
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+    pass
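With the BasePromptScorer split above, example-level and trace-level prompt scorers are now separate classes, and get/create choose the score_type (and the is_trace flag pushed to the backend) from the subclass they are called on. A hedged usage sketch based only on the signatures visible in these hunks (scorer names and prompt text are illustrative):

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    PromptScorer,
    TracePromptScorer,
)

# Creates the scorer server-side if it does not exist; is_trace=False is pushed for PromptScorer.
helpfulness = PromptScorer.create(
    name="helpfulness",  # illustrative name
    prompt="Rate how helpful the response is to the user's question.",
    threshold=0.5,
)

# Loading with the wrong class now raises: a scorer stored with is_trace=True
# can only be fetched through TracePromptScorer.get, and vice versa.
trace_quality = TracePromptScorer.get(name="conversation-quality")  # illustrative name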
judgeval/tracer/__init__.py CHANGED
@@ -43,8 +43,7 @@ from judgeval.env import (
     JUDGMENT_ORG_ID,
 )
 from judgeval.logger import judgeval_logger
-from judgeval.scorers.api_scorer import APIScorerConfig
-from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
 from judgeval.tracer.managers import (
@@ -57,7 +56,7 @@ from judgeval.utils.serialize import safe_serialize
 from judgeval.version import get_version
 from judgeval.warnings import JudgmentWarning
 
-from judgeval.tracer.keys import AttributeKeys, ResourceKeys, InternalAttributeKeys
+from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
 from judgeval.api import JudgmentSyncClient
 from judgeval.tracer.llm import wrap_provider
 from judgeval.utils.url import url_for
@@ -65,6 +64,7 @@ from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
 from judgeval.tracer.processors import (
     JudgmentSpanProcessor,
     NoOpJudgmentSpanProcessor,
+    NoOpSpanProcessor,
 )
 from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
 
@@ -85,19 +85,6 @@ class AgentContext(TypedDict):
     parent_agent_id: str | None
 
 
-def resolve_project_id(
-    api_key: str, organization_id: str, project_name: str
-) -> str | None:
-    try:
-        client = JudgmentSyncClient(
-            api_key=api_key,
-            organization_id=organization_id,
-        )
-        return client.projects_resolve({"project_name": project_name})["project_id"]
-    except Exception:
-        return None
-
-
 class Tracer:
     _active_tracers: List[Tracer] = []
 
@@ -188,38 +175,20 @@ class Tracer:
         self.cost_context = ContextVar("current_cost_context", default=None)
 
         if self.enable_monitoring:
-            project_id = resolve_project_id(
-                self.api_key, self.organization_id, self.project_name
-            )
-
-            resource_attributes = resource_attributes or {}
-            resource_attributes.update(
-                {
-                    ResourceKeys.SERVICE_NAME: self.project_name,
-                    ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
-                    ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
-                }
-            )
-
-            if project_id is not None:
-                resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = project_id
-            else:
-                judgeval_logger.error(
-                    f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
-                )
-
-            resource = Resource.create(resource_attributes)
-
             self.judgment_processor = JudgmentSpanProcessor(
                 self,
-                self.api_url,
+                self.project_name,
                 self.api_key,
                 self.organization_id,
                 max_queue_size=2**18,
                 export_timeout_millis=30000,
+                resource_attributes=resource_attributes,
             )
-            self.processors.append(self.judgment_processor)
+
+            resource = Resource.create(self.judgment_processor.resource_attributes)
             self.provider = TracerProvider(resource=resource)
+
+            self.processors.append(self.judgment_processor)
             for processor in self.processors:
                 self.provider.add_span_processor(processor)
 
@@ -253,6 +222,14 @@ class Tracer:
     def get_current_cost_context(self):
         return self.cost_context
 
+    def get_processor(self):
+        """Get the judgment span processor instance.
+
+        Returns:
+            The JudgmentSpanProcessor or NoOpJudgmentSpanProcessor instance used by this tracer.
+        """
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
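The new Tracer.get_processor accessor exposes whichever span processor the tracer selected, which is handy for flushing spans manually or wiring the processor into another OpenTelemetry pipeline. A hedged sketch (the Tracer constructor arguments are illustrative and not taken from this diff):

from judgeval.tracer import Tracer

tracer = Tracer(project_name="my-project")  # illustrative construction
processor = tracer.get_processor()  # JudgmentSpanProcessor or NoOpJudgmentSpanProcessor

# JudgmentSpanProcessor subclasses BatchSpanProcessor, so standard OTel calls apply.
processor.force_flush(timeout_millis=5000)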
@@ -507,11 +484,11 @@ class Tracer:
                         safe_serialize(format_inputs(f, args, kwargs)),
                     )
 
+                    self.judgment_processor.emit_partial()
+
                     if scorer_config:
                         self._set_pending_trace_eval(span, scorer_config, args, kwargs)
 
-                    self.judgment_processor.emit_partial()
-
                     result = f(*args, **kwargs)
                 except Exception as user_exc:
                     span.record_exception(user_exc)
@@ -559,13 +536,13 @@ class Tracer:
                         safe_serialize(format_inputs(f, args, kwargs)),
                     )
 
+                    self.judgment_processor.emit_partial()
+
                     if scorer_config:
                         self._set_pending_trace_eval(
                             main_span, scorer_config, args, kwargs
                         )
 
-                    self.judgment_processor.emit_partial()
-
                     generator = f(*args, **kwargs)
                     set_span_attribute(
                         main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
@@ -609,11 +586,11 @@ class Tracer:
                         safe_serialize(format_inputs(f, args, kwargs)),
                     )
 
+                    self.judgment_processor.emit_partial()
+
                     if scorer_config:
                         self._set_pending_trace_eval(span, scorer_config, args, kwargs)
 
-                    self.judgment_processor.emit_partial()
-
                     result = await f(*args, **kwargs)
                 except Exception as user_exc:
                     span.record_exception(user_exc)
@@ -661,13 +638,13 @@ class Tracer:
                         safe_serialize(format_inputs(f, args, kwargs)),
                     )
 
+                    self.judgment_processor.emit_partial()
+
                     if scorer_config:
                         self._set_pending_trace_eval(
                             main_span, scorer_config, args, kwargs
                         )
 
-                    self.judgment_processor.emit_partial()
-
                     async_generator = f(*args, **kwargs)
                     set_span_attribute(
                         main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
@@ -847,42 +824,6 @@ class Tracer:
 
         return sync_wrapper
 
-    @overload
-    def observe_tools(
-        self,
-        cls: Cls,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Cls: ...
-
-    @overload
-    def observe_tools(
-        self,
-        cls: None = None,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Callable[[Cls], Cls]: ...
-
-    def observe_tools(
-        self,
-        cls: Cls | None = None,
-        /,
-        *,
-        exclude_methods: List[str] = [],
-        include_private: bool = False,
-    ) -> Cls | Callable[[Cls], Cls]:
-        if cls is None:
-            return partial(
-                self.observe_tools,
-                exclude_methods=exclude_methods,
-                include_private=include_private,
-            )
-        return cls
-
     def wrap(self, client: ApiClient) -> ApiClient:
         return wrap_provider(self, client)
 
@@ -913,11 +854,7 @@ class Tracer:
         proper cleanup before program termination.
         """
         try:
-            success = self.force_flush(timeout_millis=30000)
-            if not success:
-                judgeval_logger.warning(
-                    "Some spans may not have been exported before program exit"
-                )
+            self.force_flush(timeout_millis=30000)
         except Exception as e:
             judgeval_logger.warning(f"Error during atexit flush: {e}")
 
@@ -925,7 +862,7 @@ class Tracer:
         self,
         /,
         *,
-        scorer: Union[APIScorerConfig, BaseScorer],
+        scorer: Union[ExampleAPIScorerConfig, BaseScorer],
         example: Example,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         sampling_rate: float = 1.0,
@@ -934,9 +871,9 @@ class Tracer:
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
 
-        if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+        if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
             judgeval_logger.error(
-                "Scorer must be an instance of APIScorerConfig or BaseScorer, got %s, skipping evaluation."
+                "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
                 % type(scorer)
             )
             return
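After this change, tracer-side evaluation only accepts example-level API scorers (ExampleAPIScorerConfig) or local BaseScorer instances; anything else is logged and skipped. A hedged sketch of a call that satisfies the new check (the method name async_evaluate is inferred from the async_evaluate_{span_id} run name further down and is not confirmed by this diff; import paths and Example fields are likewise illustrative):

from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # assumed re-export

example = Example(input="What is the capital of France?", actual_output="Paris.")
scorer = AnswerRelevancyScorer(threshold=0.7)  # an ExampleAPIScorerConfig subclass

# Passes the isinstance check shown above, so the evaluation is enqueued rather than skipped.
tracer.async_evaluate(scorer=scorer, example=example)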
@@ -965,7 +902,7 @@ class Tracer:
         span_context = self.get_current_span().get_span_context()
         trace_id = format(span_context.trace_id, "032x")
         span_id = format(span_context.span_id, "016x")
-        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+        hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
             isinstance(scorer, BaseScorer) and scorer.server_hosted
         )
         eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
@@ -1074,3 +1011,13 @@ def format_inputs(
         return inputs
     except Exception:
         return {}
+
+
+# Export processor classes for direct access
+__all__ = [
+    "Tracer",
+    "wrap",
+    "JudgmentSpanProcessor",
+    "NoOpJudgmentSpanProcessor",
+    "NoOpSpanProcessor",
+]
judgeval/tracer/local_eval_queue.py CHANGED
@@ -24,7 +24,7 @@ class LocalEvaluationQueue:
     """Lightweight in-memory queue for local evaluation runs.
 
     Only supports EvaluationRuns with local scorers (BaseScorer instances).
-    API scorers (APIScorerConfig) are not supported as they have their own queue.
+    API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
     """
 
     def __init__(
@@ -54,7 +54,7 @@ class LocalEvaluationQueue:
         if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
-                "Found only APIScorerConfig instances."
+                "Found only ExampleAPIScorerConfig instances."
             )
 
         return safe_run_async(
judgeval/tracer/processors/__init__.py CHANGED
@@ -6,8 +6,13 @@ from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, SpanContext
 from opentelemetry.sdk.trace.export import (
     BatchSpanProcessor,
 )
+from opentelemetry.sdk.resources import Resource
 from judgeval.tracer.exporters import JudgmentSpanExporter
-from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
+from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
+from judgeval.api import JudgmentSyncClient
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+from judgeval.version import get_version
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -31,15 +36,27 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
     def __init__(
         self,
         tracer: Tracer,
-        endpoint: str,
+        project_name: str,
         api_key: str,
         organization_id: str,
         /,
         *,
         max_queue_size: int = 2**18,
         export_timeout_millis: int = 30000,
+        resource_attributes: Optional[dict[str, Any]] = None,
     ):
         self.tracer = tracer
+        self.project_name = project_name
+        self.api_key = api_key
+        self.organization_id = organization_id
+
+        # Resolve project_id
+        self.project_id = self._resolve_project_id()
+
+        # Set up resource attributes with project_id
+        self._setup_resource_attributes(resource_attributes or {})
+
+        endpoint = url_for("/otel/v1/traces")
         super().__init__(
             JudgmentSpanExporter(
                 endpoint=endpoint,
@@ -53,6 +70,38 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
             defaultdict(dict)
         )
 
+    def _resolve_project_id(self) -> str | None:
+        """Resolve project_id from project_name using the API."""
+        try:
+            client = JudgmentSyncClient(
+                api_key=self.api_key,
+                organization_id=self.organization_id,
+            )
+            return client.projects_resolve({"project_name": self.project_name})[
+                "project_id"
+            ]
+        except Exception:
+            return None
+
+    def _setup_resource_attributes(self, resource_attributes: dict[str, Any]) -> None:
+        """Set up resource attributes including project_id."""
+        resource_attributes.update(
+            {
+                ResourceKeys.SERVICE_NAME: self.project_name,
+                ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+                ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+            }
+        )
+
+        if self.project_id is not None:
+            resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
+        else:
+            judgeval_logger.error(
+                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
+            )
+
+        self.resource_attributes = resource_attributes
+
     def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
         return (span_context.trace_id, span_context.span_id)
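Project resolution and resource-attribute assembly now live on the processor rather than on the Tracer, so the Tracer simply builds its Resource from processor.resource_attributes (see the tracer/__init__.py hunk above). A hedged sketch of constructing the processor directly with the new signature (the tracer variable and credential values are illustrative placeholders):

from opentelemetry.sdk.resources import Resource
from judgeval.tracer.processors import JudgmentSpanProcessor

processor = JudgmentSpanProcessor(
    tracer,                 # positional-only: the owning Tracer instance
    "my-project",           # project_name replaces the old endpoint argument
    "my-api-key",           # api_key (illustrative placeholder)
    "my-org-id",            # organization_id (illustrative placeholder)
    resource_attributes={"deployment.environment": "staging"},
)

# __init__ resolves the project id (logging an error if it cannot) and merges it,
# the service name, and SDK metadata into processor.resource_attributes.
resource = Resource.create(processor.resource_attributes)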
 
@@ -103,11 +152,18 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
         attributes = dict(current_span.attributes or {})
         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = current_update_id
+
+        existing_resource_attrs = (
+            dict(current_span.resource.attributes) if current_span.resource else {}
+        )
+        merged_resource_attrs = {**existing_resource_attrs, **self.resource_attributes}
+        merged_resource = Resource.create(merged_resource_attrs)
+
         partial_span = ReadableSpan(
             name=current_span.name,
             context=span_context,
             parent=current_span.parent,
-            resource=current_span.resource,
+            resource=merged_resource,
             attributes=attributes,
             events=current_span.events,
             links=current_span.links,
@@ -137,11 +193,20 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
         attributes = dict(span.attributes or {})
         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20
 
+        existing_resource_attrs = (
+            dict(span.resource.attributes) if span.resource else {}
+        )
+        merged_resource_attrs = {
+            **existing_resource_attrs,
+            **self.resource_attributes,
+        }
+        merged_resource = Resource.create(merged_resource_attrs)
+
         final_span = ReadableSpan(
             name=span.name,
             context=span.context,
             parent=span.parent,
-            resource=span.resource,
+            resource=merged_resource,
             attributes=attributes,
             events=span.events,
             links=span.links,
@@ -160,7 +225,7 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
 class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
     def __init__(self):
-        super().__init__(None, "", "", "")  # type: ignore[arg-type]
+        pass
 
     def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
         pass
@@ -177,5 +242,18 @@ class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
     def emit_partial(self) -> None:
         pass
 
+    def set_internal_attribute(
+        self, span_context: SpanContext, key: str, value: Any
+    ) -> None:
+        pass
+
+    def get_internal_attribute(
+        self, span_context: SpanContext, key: str, default: Any = None
+    ) -> Any:
+        return default
+
+    def increment_update_id(self, span_context: SpanContext) -> int:
+        return 0
+
 
-__all__ = ("NoOpSpanProcessor", "JudgmentSpanProcessor", "NoOpJudgmentSpanProcessor")
+__all__ = ["NoOpSpanProcessor", "JudgmentSpanProcessor", "NoOpJudgmentSpanProcessor"]
judgeval/tracer/utils.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any
 from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
-from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
+from judgeval.scorers.api_scorer import TraceAPIScorerConfig
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
judgeval/trainer/trainer.py CHANGED
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
 from judgeval.tracer.exporters import InMemorySpanExporter
 from judgeval.tracer.keys import AttributeKeys
 from judgeval import JudgmentClient
-from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data import Example
 from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError
@@ -154,7 +154,7 @@ class JudgmentTrainer:
     async def generate_rollouts_and_rewards(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
         num_prompts_per_step: Optional[int] = None,
         num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +264,7 @@ class JudgmentTrainer:
     async def run_reinforcement_learning(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
     ) -> ModelConfig:
         """
@@ -370,7 +370,7 @@ class JudgmentTrainer:
     async def train(
         self,
        agent_function: Callable[[Any], Any],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         prompts: List[Any],
         rft_provider: Optional[str] = None,
     ) -> ModelConfig:
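The trainer entry points keep their shape; only the scorer union now names ExampleAPIScorerConfig. A hedged sketch of calling train with the renamed type (the trainer instance, agent function, and prompts are illustrative, and AnswerCorrectnessScorer is assumed to be re-exported from judgeval.scorers):

from judgeval.scorers import AnswerCorrectnessScorer  # assumed re-export

async def my_agent(prompt):
    # illustrative agent function
    return f"answer to: {prompt}"

# Any mix of example-level API scorers and local BaseScorer subclasses is accepted.
model_config = await trainer.train(
    agent_function=my_agent,
    scorers=[AnswerCorrectnessScorer(threshold=0.7)],
    prompts=["What is 2 + 2?"],
)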