judgeval 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/__init__.py +28 -96
  3. judgeval/api/api_types.py +49 -140
  4. judgeval/constants.py +1 -5
  5. judgeval/data/__init__.py +1 -3
  6. judgeval/data/example.py +4 -2
  7. judgeval/data/judgment_types.py +57 -165
  8. judgeval/data/result.py +1 -2
  9. judgeval/data/trace.py +14 -40
  10. judgeval/dataset/__init__.py +15 -42
  11. judgeval/evaluation/__init__.py +23 -34
  12. judgeval/scorers/__init__.py +9 -7
  13. judgeval/scorers/api_scorer.py +8 -0
  14. judgeval/scorers/base_scorer.py +0 -1
  15. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
  21. judgeval/tracer/__init__.py +13 -50
  22. judgeval/tracer/local_eval_queue.py +2 -2
  23. judgeval/tracer/processors/__init__.py +1 -1
  24. judgeval/tracer/utils.py +1 -1
  25. judgeval/trainer/trainer.py +4 -4
  26. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/METADATA +1 -1
  27. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/RECORD +30 -35
  28. judgeval/data/trace_run.py +0 -39
  29. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  32. judgeval/scorers/trace_api_scorer.py +0 -5
  33. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
  34. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
  35. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -85,7 +85,6 @@ class BaseScorer(BaseModel):
85
85
  This method is used at eval time
86
86
  """
87
87
  self.model_client, self.using_native_model = create_judge(model)
88
- self.model = self.model_client.get_model_name() or model
89
88
 
90
89
  def success_check(self) -> bool:
91
90
  """
@@ -10,24 +10,16 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
10
10
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
11
11
  InstructionAdherenceScorer,
12
12
  )
13
- from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
14
- DerailmentScorer,
15
- )
16
- from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
17
13
  from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
14
+ TracePromptScorer,
18
15
  PromptScorer,
19
16
  )
20
- from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
21
- ToolDependencyScorer,
22
- )
23
17
 
24
18
  __all__ = [
25
19
  "FaithfulnessScorer",
26
20
  "AnswerRelevancyScorer",
27
21
  "AnswerCorrectnessScorer",
28
22
  "InstructionAdherenceScorer",
29
- "DerailmentScorer",
30
- "ToolOrderScorer",
23
+ "TracePromptScorer",
31
24
  "PromptScorer",
32
- "ToolDependencyScorer",
33
25
  ]
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
6
6
  """
7
7
 
8
8
  # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
9
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
10
10
  from judgeval.constants import APIScorerType
11
11
  from judgeval.data import ExampleParams
12
12
  from typing import List
13
13
 
14
14
 
15
- class AnswerCorrectnessScorer(APIScorerConfig):
15
+ class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
16
16
  score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
17
17
  required_params: List[ExampleParams] = [
18
18
  ExampleParams.INPUT,
@@ -1,10 +1,10 @@
1
- from judgeval.scorers.api_scorer import APIScorerConfig
1
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
2
2
  from judgeval.constants import APIScorerType
3
3
  from judgeval.data import ExampleParams
4
4
  from typing import List
5
5
 
6
6
 
7
- class AnswerRelevancyScorer(APIScorerConfig):
7
+ class AnswerRelevancyScorer(ExampleAPIScorerConfig):
8
8
  score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
9
9
  required_params: List[ExampleParams] = [
10
10
  ExampleParams.INPUT,
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
6
6
  """
7
7
 
8
8
  # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
9
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
10
10
  from judgeval.constants import APIScorerType
11
11
  from judgeval.data import ExampleParams
12
12
  from typing import List
13
13
 
14
14
 
15
- class FaithfulnessScorer(APIScorerConfig):
15
+ class FaithfulnessScorer(ExampleAPIScorerConfig):
16
16
  score_type: APIScorerType = APIScorerType.FAITHFULNESS
17
17
  required_params: List[ExampleParams] = [
18
18
  ExampleParams.INPUT,
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
6
6
  """
7
7
 
8
8
  # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
9
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
10
10
  from judgeval.constants import APIScorerType
11
11
  from judgeval.data import ExampleParams
12
12
 
13
13
 
14
- class InstructionAdherenceScorer(APIScorerConfig):
14
+ class InstructionAdherenceScorer(ExampleAPIScorerConfig):
15
15
  def __init__(self, threshold: float):
16
16
  super().__init__(
17
17
  threshold=threshold,
@@ -1,4 +1,8 @@
1
- from judgeval.scorers.api_scorer import APIScorerConfig
1
+ from judgeval.scorers.api_scorer import (
2
+ APIScorerConfig,
3
+ ExampleAPIScorerConfig,
4
+ TraceAPIScorerConfig,
5
+ )
2
6
  from judgeval.constants import APIScorerType
3
7
  from typing import Dict, Any, Optional
4
8
  from judgeval.api import JudgmentSyncClient
@@ -6,6 +10,7 @@ from judgeval.exceptions import JudgmentAPIError
6
10
  import os
7
11
  from copy import copy
8
12
  from judgeval.logger import judgeval_logger
13
+ from abc import ABC
9
14
 
10
15
 
11
16
  def push_prompt_scorer(
@@ -15,6 +20,7 @@ def push_prompt_scorer(
15
20
  options: Optional[Dict[str, float]] = None,
16
21
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
17
22
  organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
23
+ is_trace: Optional[bool] = None,
18
24
  ) -> str:
19
25
  client = JudgmentSyncClient(judgment_api_key, organization_id)
20
26
  try:
@@ -24,6 +30,7 @@ def push_prompt_scorer(
24
30
  "prompt": prompt,
25
31
  "threshold": threshold,
26
32
  "options": options,
33
+ "is_trace": is_trace,
27
34
  }
28
35
  )
29
36
  except JudgmentAPIError as e:
@@ -88,7 +95,7 @@ def scorer_exists(
88
95
  )
89
96
 
90
97
 
91
- class PromptScorer(APIScorerConfig):
98
+ class BasePromptScorer(ABC, APIScorerConfig):
92
99
  """
93
100
  In the Judgment backend, this scorer is implemented as a PromptScorer that takes
94
101
  1. a system role that may involve the Example object
@@ -97,9 +104,9 @@ class PromptScorer(APIScorerConfig):
97
104
  and uses a judge to execute the evaluation from the system role and classify into one of the options
98
105
  """
99
106
 
107
+ score_type: APIScorerType
100
108
  prompt: str
101
109
  options: Optional[Dict[str, float]] = None
102
- score_type: APIScorerType = APIScorerType.PROMPT_SCORER
103
110
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
104
111
  organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
105
112
 
@@ -111,7 +118,18 @@ class PromptScorer(APIScorerConfig):
111
118
  organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
112
119
  ):
113
120
  scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
121
+ if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
122
+ raise JudgmentAPIError(
123
+ status_code=400,
124
+ detail=f"Scorer with name {name} is not a {cls.__name__}",
125
+ response=None, # type: ignore
126
+ )
127
+ if issubclass(cls, TracePromptScorer):
128
+ score_type = APIScorerType.TRACE_PROMPT_SCORER
129
+ else:
130
+ score_type = APIScorerType.PROMPT_SCORER
114
131
  return cls(
132
+ score_type=score_type,
115
133
  name=name,
116
134
  prompt=scorer_config["prompt"],
117
135
  threshold=scorer_config["threshold"],
@@ -131,11 +149,24 @@ class PromptScorer(APIScorerConfig):
131
149
  organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
132
150
  ):
133
151
  if not scorer_exists(name, judgment_api_key, organization_id):
152
+ if issubclass(cls, TracePromptScorer):
153
+ is_trace = True
154
+ score_type = APIScorerType.TRACE_PROMPT_SCORER
155
+ else:
156
+ is_trace = False
157
+ score_type = APIScorerType.PROMPT_SCORER
134
158
  push_prompt_scorer(
135
- name, prompt, threshold, options, judgment_api_key, organization_id
159
+ name,
160
+ prompt,
161
+ threshold,
162
+ options,
163
+ judgment_api_key,
164
+ organization_id,
165
+ is_trace,
136
166
  )
137
167
  judgeval_logger.info(f"Successfully created PromptScorer: {name}")
138
168
  return cls(
169
+ score_type=score_type,
139
170
  name=name,
140
171
  prompt=prompt,
141
172
  threshold=threshold,
@@ -251,3 +282,11 @@ class PromptScorer(APIScorerConfig):
251
282
  k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
252
283
  }
253
284
  return base
285
+
286
+
287
+ class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
288
+ pass
289
+
290
+
291
+ class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
292
+ pass
@@ -43,8 +43,7 @@ from judgeval.env import (
43
43
  JUDGMENT_ORG_ID,
44
44
  )
45
45
  from judgeval.logger import judgeval_logger
46
- from judgeval.scorers.api_scorer import APIScorerConfig
47
- from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
46
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
48
47
  from judgeval.scorers.base_scorer import BaseScorer
49
48
  from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
50
49
  from judgeval.tracer.managers import (
@@ -485,11 +484,11 @@ class Tracer:
485
484
  safe_serialize(format_inputs(f, args, kwargs)),
486
485
  )
487
486
 
487
+ self.judgment_processor.emit_partial()
488
+
488
489
  if scorer_config:
489
490
  self._set_pending_trace_eval(span, scorer_config, args, kwargs)
490
491
 
491
- self.judgment_processor.emit_partial()
492
-
493
492
  result = f(*args, **kwargs)
494
493
  except Exception as user_exc:
495
494
  span.record_exception(user_exc)
@@ -537,13 +536,13 @@ class Tracer:
537
536
  safe_serialize(format_inputs(f, args, kwargs)),
538
537
  )
539
538
 
539
+ self.judgment_processor.emit_partial()
540
+
540
541
  if scorer_config:
541
542
  self._set_pending_trace_eval(
542
543
  main_span, scorer_config, args, kwargs
543
544
  )
544
545
 
545
- self.judgment_processor.emit_partial()
546
-
547
546
  generator = f(*args, **kwargs)
548
547
  set_span_attribute(
549
548
  main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
@@ -587,11 +586,11 @@ class Tracer:
587
586
  safe_serialize(format_inputs(f, args, kwargs)),
588
587
  )
589
588
 
589
+ self.judgment_processor.emit_partial()
590
+
590
591
  if scorer_config:
591
592
  self._set_pending_trace_eval(span, scorer_config, args, kwargs)
592
593
 
593
- self.judgment_processor.emit_partial()
594
-
595
594
  result = await f(*args, **kwargs)
596
595
  except Exception as user_exc:
597
596
  span.record_exception(user_exc)
@@ -639,13 +638,13 @@ class Tracer:
639
638
  safe_serialize(format_inputs(f, args, kwargs)),
640
639
  )
641
640
 
641
+ self.judgment_processor.emit_partial()
642
+
642
643
  if scorer_config:
643
644
  self._set_pending_trace_eval(
644
645
  main_span, scorer_config, args, kwargs
645
646
  )
646
647
 
647
- self.judgment_processor.emit_partial()
648
-
649
648
  async_generator = f(*args, **kwargs)
650
649
  set_span_attribute(
651
650
  main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
@@ -825,42 +824,6 @@ class Tracer:
825
824
 
826
825
  return sync_wrapper
827
826
 
828
- @overload
829
- def observe_tools(
830
- self,
831
- cls: Cls,
832
- /,
833
- *,
834
- exclude_methods: List[str] = [],
835
- include_private: bool = False,
836
- ) -> Cls: ...
837
-
838
- @overload
839
- def observe_tools(
840
- self,
841
- cls: None = None,
842
- /,
843
- *,
844
- exclude_methods: List[str] = [],
845
- include_private: bool = False,
846
- ) -> Callable[[Cls], Cls]: ...
847
-
848
- def observe_tools(
849
- self,
850
- cls: Cls | None = None,
851
- /,
852
- *,
853
- exclude_methods: List[str] = [],
854
- include_private: bool = False,
855
- ) -> Cls | Callable[[Cls], Cls]:
856
- if cls is None:
857
- return partial(
858
- self.observe_tools,
859
- exclude_methods=exclude_methods,
860
- include_private=include_private,
861
- )
862
- return cls
863
-
864
827
  def wrap(self, client: ApiClient) -> ApiClient:
865
828
  return wrap_provider(self, client)
866
829
 
@@ -899,7 +862,7 @@ class Tracer:
899
862
  self,
900
863
  /,
901
864
  *,
902
- scorer: Union[APIScorerConfig, BaseScorer],
865
+ scorer: Union[ExampleAPIScorerConfig, BaseScorer],
903
866
  example: Example,
904
867
  model: str = JUDGMENT_DEFAULT_GPT_MODEL,
905
868
  sampling_rate: float = 1.0,
@@ -908,9 +871,9 @@ class Tracer:
908
871
  judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
909
872
  return
910
873
 
911
- if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
874
+ if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
912
875
  judgeval_logger.error(
913
- "Scorer must be an instance of APIScorerConfig or BaseScorer, got %s, skipping evaluation."
876
+ "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
914
877
  % type(scorer)
915
878
  )
916
879
  return
@@ -939,7 +902,7 @@ class Tracer:
939
902
  span_context = self.get_current_span().get_span_context()
940
903
  trace_id = format(span_context.trace_id, "032x")
941
904
  span_id = format(span_context.span_id, "016x")
942
- hosted_scoring = isinstance(scorer, APIScorerConfig) or (
905
+ hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
943
906
  isinstance(scorer, BaseScorer) and scorer.server_hosted
944
907
  )
945
908
  eval_run_name = f"async_evaluate_{span_id}" # note this name doesnt matter because we don't save the experiment only the example and scorer_data
@@ -24,7 +24,7 @@ class LocalEvaluationQueue:
24
24
  """Lightweight in-memory queue for local evaluation runs.
25
25
 
26
26
  Only supports EvaluationRuns with local scorers (BaseScorer instances).
27
- API scorers (APIScorerConfig) are not supported as they have their own queue.
27
+ API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
28
28
  """
29
29
 
30
30
  def __init__(
@@ -54,7 +54,7 @@ class LocalEvaluationQueue:
54
54
  if not evaluation_run.custom_scorers:
55
55
  raise ValueError(
56
56
  "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
57
- "Found only APIScorerConfig instances."
57
+ "Found only ExampleAPIScorerConfig instances."
58
58
  )
59
59
 
60
60
  return safe_run_async(
@@ -97,7 +97,7 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
97
97
  resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
98
98
  else:
99
99
  judgeval_logger.error(
100
- f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
100
+ f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
101
101
  )
102
102
 
103
103
  self.resource_attributes = resource_attributes
judgeval/tracer/utils.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any
2
2
  from opentelemetry.trace import Span
3
3
  from pydantic import BaseModel
4
4
  from typing import Callable, Optional
5
- from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
5
+ from judgeval.scorers.api_scorer import TraceAPIScorerConfig
6
6
  from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
7
7
 
8
8
 
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
10
10
  from judgeval.tracer.exporters import InMemorySpanExporter
11
11
  from judgeval.tracer.keys import AttributeKeys
12
12
  from judgeval import JudgmentClient
13
- from judgeval.scorers import BaseScorer, APIScorerConfig
13
+ from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
14
14
  from judgeval.data import Example
15
15
  from .console import _spinner_progress, _print_progress, _print_progress_update
16
16
  from judgeval.exceptions import JudgmentRuntimeError
@@ -154,7 +154,7 @@ class JudgmentTrainer:
154
154
  async def generate_rollouts_and_rewards(
155
155
  self,
156
156
  agent_function: Callable[[Any], Any],
157
- scorers: List[Union[APIScorerConfig, BaseScorer]],
157
+ scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
158
158
  prompts: List[Any],
159
159
  num_prompts_per_step: Optional[int] = None,
160
160
  num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +264,7 @@ class JudgmentTrainer:
264
264
  async def run_reinforcement_learning(
265
265
  self,
266
266
  agent_function: Callable[[Any], Any],
267
- scorers: List[Union[APIScorerConfig, BaseScorer]],
267
+ scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
268
268
  prompts: List[Any],
269
269
  ) -> ModelConfig:
270
270
  """
@@ -370,7 +370,7 @@ class JudgmentTrainer:
370
370
  async def train(
371
371
  self,
372
372
  agent_function: Callable[[Any], Any],
373
- scorers: List[Union[APIScorerConfig, BaseScorer]],
373
+ scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
374
374
  prompts: List[Any],
375
375
  rft_provider: Optional[str] = None,
376
376
  ) -> ModelConfig:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.9.4
3
+ Version: 0.10.0
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,69 +1,64 @@
1
- judgeval/__init__.py,sha256=1af4bHzNfPIajg0F19xg95sxT7_-vI2jbloic2XhX5M,4948
1
+ judgeval/__init__.py,sha256=MqB1s0zp-Fr_KvKFjGKnRHUeulutmrlMcUyjNRRAU_4,4962
2
2
  judgeval/cli.py,sha256=R5IiIQmSVg21kQHX2kL3sOeXCxvvAMSqyva3Z9AoSXc,1560
3
- judgeval/constants.py,sha256=fqzSY7tDfseWy1trLjCSGC6WVOFEm_4hvA8IFpv7CUc,3683
3
+ judgeval/constants.py,sha256=h7Cuf_2uvNzHZi8nqRFoMpvsQUZMS3mlNB3s2uduse8,3557
4
4
  judgeval/env.py,sha256=R0bj7XU29RIVVQjkVMa11ObhOYVMbaE_3LTvL3I9dWM,2212
5
5
  judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
6
6
  judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
7
7
  judgeval/version.py,sha256=kJtYsih3hTYZ_rY_Lt0RcFqvjAfF5Xo1uNq0jZWJ5pw,73
8
8
  judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
9
- judgeval/api/__init__.py,sha256=DZ-dijtkzUsjY3CBdCh6TH_PHC5qlI_tAFCBgvAZNjU,14538
10
- judgeval/api/api_types.py,sha256=4xyqlmV9mEoTUIbii-bj7oS0fVwWrJ_UhYxpXvcBywA,9198
11
- judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
9
+ judgeval/api/__init__.py,sha256=RWQDwzT93nXWih3WYMPl1OL2ga9uk0dUGYV7fEDzBso,12764
10
+ judgeval/api/api_types.py,sha256=uyz8ePQI-ec88PVwhHN-KVmldAmNgRjOVmesVDKIBUw,6461
11
+ judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
12
12
  judgeval/data/evaluation_run.py,sha256=G7ad4eDQTjketfcQRITk8bs8CIO8rm058H1G_qkLmhc,4729
13
- judgeval/data/example.py,sha256=aTZg0GWQmUEBHk1n9Asw8sz-8YBWKlFsMZYjwq1DfrI,917
14
- judgeval/data/judgment_types.py,sha256=b2pDeEOSl_zHJLDzqr0AGYbZ5zrooJMr5VmK-bDrN4o,17082
15
- judgeval/data/result.py,sha256=JQ6f0XzL9p0oPmx-_z2NKUcISO6pISsVZ5dT1jkBeZs,2120
13
+ judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
14
+ judgeval/data/judgment_types.py,sha256=JkhNG6fRBFdryG8ogVZsMWtq3W3JmWh0AYIR8LdBAT4,11773
15
+ judgeval/data/result.py,sha256=LA0OzwcVKwD5NkmtmFuA_EusmYRyE10mjDMXa2bgU1g,2067
16
16
  judgeval/data/scorer_data.py,sha256=g9PE0DNLikW0LgxGWhgpCiNVOX8PzqEaZKivifLOUDI,2997
17
17
  judgeval/data/tool.py,sha256=bj_WxFg22mypUUVR5KqQRxMDHWvKwiE1MMPjLnTCoDU,99
18
- judgeval/data/trace.py,sha256=HTeucJqNdFsQI3Ybb6mJ8NkmHkc1vIddzQ7BtQs25k8,1315
19
- judgeval/data/trace_run.py,sha256=VCQUdDlrHixyiqWW1RUiCtLgqMt-3oW1M1A7CCer2Ok,1635
18
+ judgeval/data/trace.py,sha256=R9RF1kv1JHeOpjXLjErJcxV2RrNrJUSqWcWe73l3f9k,503
20
19
  judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
21
20
  judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
22
- judgeval/dataset/__init__.py,sha256=xlg4VgEvbReWOlk6MK4GqJubSOeo17VqoIyjdMwmIf8,6573
23
- judgeval/evaluation/__init__.py,sha256=O0sk3zP5jbHPtknT6DuB6ijkJ3-0I54mMf1UzDPKMF0,15409
21
+ judgeval/dataset/__init__.py,sha256=S1iLL7ivDLIT3aTNO1ardHqhIRxXMuoW5PFLFIkt4uY,5731
22
+ judgeval/evaluation/__init__.py,sha256=u-aDyLTRebPZigeBbJHpnZk3wQAS7jv_VgLXIi-jMGU,15075
24
23
  judgeval/integrations/langgraph/__init__.py,sha256=VvqCKOk65A2gLlr8uWrJVzpRF5OnIja5zwF4hGPEFsw,27540
25
24
  judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
26
25
  judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
27
26
  judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
28
27
  judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
29
28
  judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
30
- judgeval/scorers/__init__.py,sha256=a5f_QcC7P9DjoOu_DMmADlkIXebo0d3zEJDJ7mhN3tM,640
29
+ judgeval/scorers/__init__.py,sha256=34PMPsfR2_3n7T96wpSfAZJWzWlU6v53S3mGX2PE87k,665
31
30
  judgeval/scorers/agent_scorer.py,sha256=V1NSwhGWgtXPsX-blKLkDLsPPbEiP-A4614X-95dtlQ,565
32
- judgeval/scorers/api_scorer.py,sha256=M7cwJ2YY2Mw0pCo1UH-29jwrNd2PdiBRdQtmWS5ijXA,2173
33
- judgeval/scorers/base_scorer.py,sha256=8uhkmj78R6-Stenl1eo6IVqKSBgkLpoqR0acGi-Fxik,2788
31
+ judgeval/scorers/api_scorer.py,sha256=8TUJut9r74v-qMACiSKAUbDI1v3ZItPXrTz8s4_Lrgk,2287
32
+ judgeval/scorers/base_scorer.py,sha256=naGiZYHnkn9HVwY-jpOY7O6cYPJJJe5dHbrRBSOikxw,2723
34
33
  judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
35
34
  judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
36
35
  judgeval/scorers/score.py,sha256=xquM59SCtNeuAsrBsHFgBQk3CHp4-bms4oFs24xfcU0,7176
37
- judgeval/scorers/trace_api_scorer.py,sha256=B2Vp8Jj2I7N-G1weHMm1b_9gVbn0BMcOtestMFNtx08,112
38
36
  judgeval/scorers/utils.py,sha256=iSZONwK0HecxUPz-cMCyra_87DSCag1E8BdpF2a4_44,377
39
37
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=MFsxDPZoZibJlsz4RgtLehA3rVcEfS9o5cw0l8gI5IM,1046
41
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=zJsU0VrUmRhY9qav48c6jTyDqUwI3JzhV9ajtlJCe0M,544
42
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=UDfzTO9Fx0FA5o0wfD8kprrGA4eW-43Rn9Gc0BQtKgY,393
43
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=mbBvirNcivu9dP6deM7FogDXrdwI9o8yqsO8IeKPSb4,309
44
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
45
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
46
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=C-9Q7s9K7mcgFMcEL0I_7XQZMRqrL5MFRi9G6Dx8-v8,8505
47
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=BhrLnIASZOTT9XJ6giYSoVfdR7NYsjRRTOTNioNtEiU,610
48
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=bMu0WMJaXdMyDTN42sVLoWV-lrUHCEa8iDrCI_K7nlQ,808
49
- judgeval/tracer/__init__.py,sha256=0DM6ixBI75FuVG7UMG_k-KHJm1MyFbRyhAUPm2GYu9A,36057
38
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
39
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=_qa1sOHUwJubBCfyx6lsE_4vZsUh65VoTZba1NSouis,558
40
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
41
+ judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=lIJ3GgOI9tfbrC7voZMvlxXdK3X1bhdj2zNxqdaGIkM,545
42
+ judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=bSwbpVNhpkpEeX3GtCJuyz5vFyY1gbyqYEfaBF2KTVY,697
43
+ judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=E2_TVO88iLSBAdcKYnfHYp4cUyffgG_p1th5aCpjCd8,9680
44
+ judgeval/tracer/__init__.py,sha256=mQQaca8XJRYwSRn7a5x63dFQeA8xGjwfoZYikQCAAyI,35214
50
45
  judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
51
46
  judgeval/tracer/keys.py,sha256=qXPoZSkEhVF-YYfQ9-zeDMVdr4GtpPf2W7MPJaN2AQo,2889
52
- judgeval/tracer/local_eval_queue.py,sha256=Amt7xkdmVJH1l2itm-ogiIW5oDaLnACisGfsdZjazn0,7228
47
+ judgeval/tracer/local_eval_queue.py,sha256=iv9on1G4woGlhYn1mZATEMkzCiz-qVn2cdzEINzQFYQ,7242
53
48
  judgeval/tracer/managers.py,sha256=h2ZHJ61_vf3cS-HlEUiodFzKDUuQWIhYC6n7pMVyM9c,6113
54
- judgeval/tracer/utils.py,sha256=jljfr-oiCy8agOh0apAoR04tR2XRAzFg51On_LPzue8,600
49
+ judgeval/tracer/utils.py,sha256=3_8ZjjF4XgNyAu9LpThq5dVOcwdwI-E3vb-HRl_Px8c,594
55
50
  judgeval/tracer/exporters/__init__.py,sha256=lnZXfPGaQH844HAIuZCQqjqhnmZGA98kHY8Xp-Oi4Ws,1220
56
51
  judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
57
52
  judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
58
53
  judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
59
54
  judgeval/tracer/llm/__init__.py,sha256=p9uwWPg9k-NcWjj9TbwQj55sHhBOqRYx2-Ld6YHaFUs,42625
60
55
  judgeval/tracer/llm/providers.py,sha256=QQLJlSNnDjXRAc2Wqw78o254COJUSXX39D7D_mx3NVA,2651
61
- judgeval/tracer/processors/__init__.py,sha256=fjk3zGxQGp6adnj1-QdSaiRJk-VhyzuKG5vCalvbucI,8645
56
+ judgeval/tracer/processors/__init__.py,sha256=tXbQaXGMQeutgM_7d5Y2EFTeSjbVEBky685Dst_v3rg,8672
62
57
  judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
63
58
  judgeval/trainer/config.py,sha256=8s0X8B334PJomorwONaUpb6K8cAMxRdYAeQdtx7HPHs,4258
64
59
  judgeval/trainer/console.py,sha256=PJ0rCnDwC7aoW-VsLDS96ZyMyagh-l9EOJKff1ATIpo,4342
65
60
  judgeval/trainer/trainable_model.py,sha256=vSDtHJJ-fLczC2gkaY9jG6TQvLgWqaVjElm1l8YlJcU,8959
66
- judgeval/trainer/trainer.py,sha256=_dlV0NSD4jfNgTb2GwghWGBmnoNsooQq85nvIWW5VR4,16550
61
+ judgeval/trainer/trainer.py,sha256=YhepEm3M-5z1RB50cAEsLbZiOIE_fOWiX-thyvBj6v4,16578
67
62
  judgeval/utils/async_utils.py,sha256=lgCgi8gkLUcAEepruEkx-AGQgJnAJpKmBIhZx6Y0q2s,935
68
63
  judgeval/utils/decorators.py,sha256=rdqY1w0zNL6O6GU6Wdeo0-x5EgpFTEhU2vkgiWsRYdc,525
69
64
  judgeval/utils/file_utils.py,sha256=3LI1YCZwO5ogTgJreyOgRgDksey3natO2Td1PQqaPyY,3252
@@ -73,8 +68,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
73
68
  judgeval/utils/testing.py,sha256=kJOq4LlEXaNThfg9oSIRqSK7IH8AwLgbukjn5uxMY7A,3661
74
69
  judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
75
70
  judgeval/utils/version_check.py,sha256=kcF6SvB6GbVKI0Gv9QRVm-kvBn9_z-c3jmPORsXO3h0,1015
76
- judgeval-0.9.4.dist-info/METADATA,sha256=Hr4y27-wt-658_DJd_D7oAUpDebQS5a9jfdQwGvfmbg,8869
77
- judgeval-0.9.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
78
- judgeval-0.9.4.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
79
- judgeval-0.9.4.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
80
- judgeval-0.9.4.dist-info/RECORD,,
71
+ judgeval-0.10.0.dist-info/METADATA,sha256=vpsStrROABbjYIuuO8UqssmVjq70k4rLH2AvEz4jie8,8870
72
+ judgeval-0.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
73
+ judgeval-0.10.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
74
+ judgeval-0.10.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
75
+ judgeval-0.10.0.dist-info/RECORD,,
@@ -1,39 +0,0 @@
1
- from pydantic import BaseModel
2
- from typing import List, Optional, Dict, Any, Union
3
- from judgeval.data import Trace
4
- from judgeval.scorers import APIScorerConfig, BaseScorer
5
- from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
6
-
7
-
8
- class TraceRun(BaseModel):
9
- """
10
- Stores example and evaluation scorers together for running an eval task
11
-
12
- Args:
13
- project_name (str): The name of the project the evaluation results belong to
14
- eval_name (str): A name for this evaluation run
15
- traces (List[Trace]): The traces to evaluate
16
- scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
17
- model (str): The model used as a judge when using LLM as a Judge
18
- metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
19
- rules (Optional[List[Rule]]): Rules to evaluate against scoring results
20
- append (Optional[bool]): Whether to append to existing evaluation results
21
- tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
22
- """
23
-
24
- organization_id: Optional[str] = None
25
- project_name: Optional[str] = None
26
- eval_name: Optional[str] = None
27
- traces: Optional[List[Trace]] = None
28
- scorers: List[Union[APIScorerConfig, BaseScorer]]
29
- model: Optional[str] = JUDGMENT_DEFAULT_GPT_MODEL
30
- trace_span_id: Optional[str] = None
31
- append: Optional[bool] = False
32
- override: Optional[bool] = False
33
-
34
- # TODO: ?
35
- rules: Any = None
36
- tools: Optional[List[Dict[str, Any]]] = None
37
-
38
- class Config:
39
- arbitrary_types_allowed = True
@@ -1,14 +0,0 @@
1
- """
2
- `judgeval` answer relevancy scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
10
- from judgeval.constants import APIScorerType
11
-
12
-
13
- class DerailmentScorer(APIScorerConfig):
14
- score_type: APIScorerType = APIScorerType.DERAILMENT
@@ -1,20 +0,0 @@
1
- """
2
- `judgeval` tool dependency scorer
3
- """
4
-
5
- # Internal imports
6
- from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
7
- from judgeval.constants import APIScorerType
8
- from typing import Optional, Dict
9
-
10
-
11
- class ToolDependencyScorer(TraceAPIScorerConfig):
12
- kwargs: Optional[Dict] = None
13
-
14
- def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
15
- super().__init__(threshold=threshold, score_type=APIScorerType.TOOL_DEPENDENCY)
16
- self.kwargs = {"enable_param_checking": enable_param_checking}
17
-
18
- @property
19
- def __name__(self):
20
- return "Tool Dependency"
@@ -1,27 +0,0 @@
1
- """
2
- `judgeval` tool order scorer
3
- """
4
-
5
- # Internal imports
6
- from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
7
- from judgeval.constants import APIScorerType
8
- from typing import Dict, Any
9
-
10
-
11
- class ToolOrderScorer(TraceAPIScorerConfig):
12
- score_type: APIScorerType = APIScorerType.TOOL_ORDER
13
- threshold: float = 1.0
14
- exact_match: bool = False
15
-
16
- def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
17
- base = super().model_dump(*args, **kwargs)
18
- base_fields = set(TraceAPIScorerConfig.model_fields.keys())
19
- all_fields = set(self.__class__.model_fields.keys())
20
-
21
- extra_fields = all_fields - base_fields - {"kwargs"}
22
-
23
- base["kwargs"] = {
24
- k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
25
- }
26
-
27
- return base