judgeval 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.3 → judgeval-0.0.4}/.github/workflows/ci.yaml +1 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/.gitignore +1 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/PKG-INFO +1 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/Pipfile +3 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/introduction.mdx +18 -20
- judgeval-0.0.4/docs/evaluation/scorers/answer_correctness.mdx +56 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/answer_relevancy.mdx +1 -1
- judgeval-0.0.4/docs/evaluation/scorers/classifier_scorer.mdx +90 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_precision.mdx +1 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_recall.mdx +1 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_relevancy.mdx +1 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/faithfulness.mdx +3 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/hallucination.mdx +3 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/json_correctness.mdx +3 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/summarization.mdx +3 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/tool_correctness.mdx +3 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/getting_started.mdx +31 -46
- judgeval-0.0.4/docs/images/trace_screenshot.png +0 -0
- judgeval-0.0.4/docs/judgment/introduction.mdx +7 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/mint.json +9 -4
- judgeval-0.0.4/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/pyproject.toml +1 -1
- judgeval-0.0.4/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +781 -0
- judgeval-0.0.4/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- judgeval-0.0.4/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +1 -0
- judgeval-0.0.4/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +1375 -0
- judgeval-0.0.4/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +20 -0
- judgeval-0.0.4/src/demo/cookbooks/openai_travel_agent/agent.py +208 -0
- judgeval-0.0.4/src/demo/cookbooks/openai_travel_agent/populate_db.py +73 -0
- judgeval-0.0.4/src/judgeval/__init__.py +12 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/tracer.py +57 -31
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/constants.py +1 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/__init__.py +2 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/scorer_data.py +2 -2
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/evaluation_run.py +16 -15
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/__init__.py +2 -2
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/base_judge.py +1 -1
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/litellm_judge.py +2 -2
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/mixture_of_judges.py +2 -2
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/together_judge.py +2 -2
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/utils.py +4 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judgment_client.py +67 -15
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/run_evaluation.py +79 -14
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/__init__.py +8 -4
- judgeval-0.0.4/src/judgeval/scorers/api_scorer.py +64 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/base_scorer.py +3 -2
- judgeval-0.0.4/src/judgeval/scorers/exceptions.py +11 -0
- judgeval-0.0.3/src/judgeval/scorers/custom_scorer.py → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorer.py +9 -5
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/__init__.py +144 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/answer_relevancy.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_precision.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_recall.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_relevancy.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/faithfulness.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/hallucination.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/json_correctness.py +7 -7
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/summarization.py +2 -2
- {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/tool_correctness.py +2 -2
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
- judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/prompt_scorer.py +4 -4
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/score.py +14 -14
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/utils.py +40 -6
- judgeval-0.0.3/src/judgeval/__init__.py +0 -83
- judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -21
- {judgeval-0.0.3 → judgeval-0.0.4}/LICENSE.md +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/README.md +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/README.md +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/development.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/favicon.svg +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/introduction.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/logo/light.svg +0 -0
- {judgeval-0.0.3/docs/judgment → judgeval-0.0.4/docs/monitoring}/introduction.mdx +0 -0
- /judgeval-0.0.3/docs/evaluation/scorers/classifier_scorer.mdx → /judgeval-0.0.4/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/pytest.ini +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/result.py +0 -0
@@ -8,25 +8,6 @@ Evaluation is the process of **scoring** an LLM system's outputs with metrics; a
 - An evaluation dataset
 - Metrics we are interested in tracking
 
-The ideal fit of evaluation into an application workflow looks like this:
-
-
-
-## Metrics
-
-`judgeval` comes with a set of 10+ built-in evaluation metrics. These metrics are accessible through `judgeval`'s `Scorer` interface.
-Every `Scorer` has a `threshold` parameter that you can use in the context of unit testing your app.
-
-```python scorer.py
-from judgeval.scorers import FaithfulnessScorer
-
-scorer = FaithfulnessScorer(threshold=1.0)
-```
-You can use scorers to evaluate your LLM system's outputs by using `Example`s.
-
-<Tip>
-We're always working on adding new scorers, so if you have a metric you'd like to add, please [let us know!](mailto:contact@judgmentlabs.ai)
-</Tip>
 
 ## Examples
 
@@ -54,7 +35,7 @@ Creating an Example allows you to evaluate using
 `judgeval`'s default scorers:
 
 ```python example.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.scorers import FaithfulnessScorer
 
 client = JudgmentClient()
@@ -102,6 +83,23 @@ results = client.evaluate_dataset(
 )
 ```
 
+## Metrics
+
+`judgeval` comes with a set of 10+ built-in evaluation metrics. These metrics are accessible through `judgeval`'s `Scorer` interface.
+Every `Scorer` has a `threshold` parameter that you can use in the context of unit testing your app.
+
+```python scorer.py
+from judgeval.scorers import FaithfulnessScorer
+
+scorer = FaithfulnessScorer(threshold=1.0)
+```
+You can use scorers to evaluate your LLM system's outputs by using `Example`s.
+
+<Tip>
+We're always working on adding new scorers, so if you have a metric you'd like to add, please [let us know!](mailto:contact@judgmentlabs.ai)
+</Tip>
+
+
 **Congratulations!** 🎉
 
 You've learned the basics of building and running evaluations with `judgeval`.
@@ -0,0 +1,56 @@
+---
+title: Answer Correctness
+description: ""
+---
+
+The answer correctness scorer is a default LLM judge scorer that measures how correct/consistent the LLM system's `actual_output` is to the `expected_output`.
+In practice, this scorer helps determine whether your LLM application produces **answers that are consistent with golden/ground truth answers**.
+
+
+## Required Fields
+
+To run the answer correctness scorer, you must include the following fields in your `Example`:
+- `input`
+- `actual_output`
+- `expected_output`
+
+## Scorer Breakdown
+
+`AnswerCorrectness` scores are calculated by extracting statements made in the `expected_output` and classifying how many are consistent/correct with respect to the `actual_output`.
+
+The score is calculated as:
+
+$$
+\text{correctness score} = \frac{\text{correct statements}}{\text{total statements}}
+$$
+
+## Sample Implementation
+
+```python answer_correctness.py
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import AnswerCorrectnessScorer
+
+client = JudgmentClient()
+example = Example(
+    input="What's your return policy for a pair of socks?",
+    # Replace this with your LLM system's output
+    actual_output="We offer a 30-day return policy for all items, including socks!",
+    # Replace this with your golden/ground truth answer
+    expected_output="Socks can be returned within 30 days of purchase.",
+)
+# supply your own threshold
+scorer = AnswerCorrectnessScorer(threshold=0.8)
+
+results = client.run_evaluation(
+    examples=[example],
+    scorers=[scorer],
+    model="gpt-4o",
+)
+print(results)
+```
+
+<Note>
+The `AnswerCorrectness` scorer uses an LLM judge, so you'll receive a reason for the score in the `reason` field of the results.
+This allows you to double-check the accuracy of the evaluation and understand how the score was calculated.
+</Note>
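
Editor's note: to make the correctness formula above concrete, here is a minimal illustrative sketch in plain Python. It is not part of the package and does not reproduce the scorer's internal implementation; the statements and their labels are placeholders.

```python
# Illustrative only: applies correct / total and the threshold check described above.
statements = [
    ("Socks can be returned within 30 days of purchase.", True),  # judged consistent with actual_output
]

correct = sum(1 for _, is_correct in statements if is_correct)
score = correct / len(statements)  # correctness score = correct statements / total statements
threshold = 0.8                    # same threshold passed to AnswerCorrectnessScorer

print(f"score={score:.2f}, passes={score >= threshold}")
```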
@@ -0,0 +1,90 @@
+---
+title: Classifier Scorers
+description: ""
+---
+
+A `ClassifierScorer` is a powerful tool for evaluating your LLM system using natural language criteria.
+Classifier scorers are great for prototyping new evaluation criteria on a small set of examples before using them to benchmark your workflows at scale.
+
+## Creating a Classifier Scorer
+
+### `judgeval` SDK
+
+You can create a `ClassifierScorer` by providing a natural language description of your evaluation task/criteria and a set of choices that an LLM judge can choose from when evaluating an example.
+Here's an example of creating a `ClassifierScorer` that determines if a response is friendly or not:
+
+```python friendliness_scorer.py
+
+from judgeval.scorers import ClassifierScorer
+
+friendliness_scorer = ClassifierScorer(
+    name="Friendliness Scorer",
+    threshold=1.0,
+    conversation=[
+        {
+            "role": "system",
+            "content": "Is the response positive (Y/N)? The response is: {{actual_output}}."
+        }
+    ],
+    options={"Y": 1, "N": 0}
+)
+```
+
+<Tip>
+You can put variables from [`Example`s](/evaluation/data_examples) into your `conversation` by using the mustache `{{variable_name}}` syntax.
+</Tip>
+
+### `Judgment` Platform
+
+1. Navigate to the `Scorers` tab in the Judgment platform. You'll find this via the sidebar on the left.
+2. Click the `Create Scorer` button in the top right corner.
+
+
+
+3. Here, you can create a custom scorer by using criteria in natural language, supplying custom arguments from the [`Example`](evaluation/data_examples) class.
+Then, you supply a set of **choices** the scorer can select from when evaluating an example. Finally, you can test your scorer on samples in our playground.
+
+4. Once you're finished, you can save the scorer and use it in your evaluation runs just like any other scorer in `judgeval`.
+
+#### Example
+
+Here's an example of building a similar `ClassifierScorer` that checks if the LLM's tone is too aggressive.
+
+
+
+
+## Using a Classifier Scorer
+
+Classifier scorers can be used in the same way as any other scorer in `judgeval`.
+They can also be run in conjunction with other scorers in a single evaluation run!
+
+```python run_classifier_scorer.py
+...
+
+results = client.run_evaluation(
+    examples=[example1],
+    scorers=[friendliness_scorer],
+    model="gpt-4o"
+)
+```
+
+### Saving Classifier Scorers
+
+Whether you create a `ClassifierScorer` via the `judgeval` SDK or the Judgment platform, you can save it to the `Judgment` platform for reuse in future evaluations.
+- If you create a `ClassifierScorer` via the `judgeval` SDK, you can save it by calling `client.push_classifier_scorer()`.
+- Similarly, you can load a `ClassifierScorer` by calling `client.fetch_classifier_scorer()`.
+- Each `ClassifierScorer` has a **unique slug** that you can use to identify it.
+
+```python
+from judgeval import JudgmentClient
+
+client = JudgmentClient()
+
+# Saving a ClassifierScorer from SDK to platform
+friendliness_slug = client.push_classifier_scorer(friendliness_scorer)
+
+# Loading a ClassifierScorer from platform to SDK
+classifier_scorer = client.fetch_classifier_scorer("classifier-scorer-slug")
+```
+
+TODO add image of slugs on the platform
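
Editor's note: as a rough end-to-end sketch of the save/load workflow documented in the hunk above, a fetched `ClassifierScorer` can be passed to `run_evaluation` like any other scorer. The slug and example fields below are placeholders, not values from the package.

```python
from judgeval import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()

# Hypothetical slug for a ClassifierScorer previously pushed to the platform
friendliness_scorer = client.fetch_classifier_scorer("friendliness-scorer-slug")

example = Example(
    input="Can I return these socks?",
    actual_output="Of course! We'd be happy to help with a return within 30 days.",
)

results = client.run_evaluation(
    examples=[example],
    scorers=[friendliness_scorer],
    model="gpt-4o",
)
print(results)
```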
@@ -42,7 +42,7 @@ Our contextual precision scorer is based on Stanford NLP's [ARES](https://arxiv.
 ## Sample Implementation
 
 ```python contextual_precision.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import ContextualPrecisionScorer
 
@@ -41,7 +41,7 @@ Our contextual recall scorer is based on Stanford NLP's [ARES](https://arxiv.org
 ## Sample Implementation
 
 ```python contextual_recall.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import ContextualRecallScorer
 
@@ -31,7 +31,7 @@ Our contextual relevancy scorer is based on Stanford NLP's [ARES](https://arxiv.
 ## Sample Implementation
 
 ```python contextual_relevancy.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import ContextualRelevancyScorer
 
@@ -37,10 +37,9 @@ $$
 ## Sample Implementation
 
 ```python faithfulness.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.constants import APIScorer
+from judgeval.scorers import FaithfulnessScorer
 
 client = JudgmentClient()
 example = Example(
@@ -51,7 +50,7 @@ example = Example(
     retrieval_context=["Return policy, all items: 30-day limit for full refund, no questions asked."]
 )
 # supply your own threshold
-scorer =
+scorer = FaithfulnessScorer(threshold=0.8)
 
 results = client.run_evaluation(
     examples=[example],
@@ -30,10 +30,9 @@ $$
 ## Sample Implementation
 
 ```python hallucination.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.constants import APIScorer
+from judgeval.scorers import HallucinationScorer
 
 client = JudgmentClient()
 example = Example(
@@ -44,7 +43,7 @@ example = Example(
     context=["**RETURN POLICY** all products returnable with no cost for 30-days after purchase (receipt required)."]
 )
 # supply your own threshold
-scorer =
+scorer = HallucinationScorer(threshold=0.8)
 
 results = client.run_evaluation(
     examples=[example],
@@ -35,17 +35,16 @@ $$
 ## Sample Implementation
 
 ```python json_correctness.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.constants import APIScorer
+from judgeval.scorers import JSONCorrectnessScorer
 client = JudgmentClient()
 example = Example(
     input="Create a JSON object with the keys 'field1' (str) and 'field2' (int). Fill them with random values.",
     # Replace this with your LLM system's output
     actual_output="{'field1': 'value1', 'field2': 1}",
 )
-scorer =
+scorer = JSONCorrectnessScorer(threshold=0.8)
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -40,10 +40,9 @@ $$
 ## Sample Implementation
 
 ```python summarization.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.constants import APIScorer
+from judgeval.scorers import SummarizationScorer
 
 client = JudgmentClient()
 example = Example(
@@ -52,7 +51,7 @@ example = Example(
     actual_output="...",
 )
 # supply your own threshold
-scorer =
+scorer = SummarizationScorer(threshold=0.8)
 
 results = client.run_evaluation(
     examples=[example],
@@ -27,10 +27,9 @@ TODO add more docs here regarding tool ordering, exact match, or even correct to
 ## Sample Implementation
 
 ```python tool_correctness.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.constants import APIScorer
+from judgeval.scorers import ToolCorrectnessScorer
 
 client = JudgmentClient()
 example = Example(
@@ -40,7 +39,7 @@ example = Example(
     expected_output=["DBQuery", "GoogleSearch"],
 )
 # supply your own threshold
-scorer =
+scorer = ToolCorrectnessScorer(threshold=0.8)
 
 results = client.run_evaluation(
     examples=[example],
@@ -19,19 +19,23 @@ access our state-of-the-art judge models, and manage your evaluations/datasets o
 Once you have a key, you can set the environment variable `JUDGMENT_API_KEY` to your key.
 This allows the `JudgmentClient` to authenticate your requests to the Judgment API.
 
+```
+export JUDGMENT_API_KEY="your_key_here"
+```
+
 To receive a key, please email us at `contact@judgmentlabs.ai`.
 
 
 <Note>
 Running evaluations on Judgment Labs' infrastructure is recommended for
 large-scale evaluations. [Contact us](mailto:contact@judgmentlabs.ai) if you're dealing with
-sensitive data that has to reside in your private VPCs
+sensitive data that has to reside in your private VPCs.
 </Note>
 
 # Create your first evaluation
 
 ```python sample_eval.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import FaithfulnessScorer
 
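
Editor's note: the docs above state that exporting `JUDGMENT_API_KEY` is what lets `JudgmentClient` authenticate. A minimal sketch of that setup step, assuming the client reads the variable from the environment (the key handling shown here is not part of the package):

```python
import os

from judgeval import JudgmentClient

# Fail fast with a clear message if the key was not exported in the shell.
if "JUDGMENT_API_KEY" not in os.environ:
    raise RuntimeError("Set JUDGMENT_API_KEY before creating a JudgmentClient.")

client = JudgmentClient()  # assumed to pick up JUDGMENT_API_KEY for authentication
```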
@@ -58,16 +62,16 @@ Congratulations! Your evaluation should have passed. Let's break down what happe
 - The variable `retrieval_context` represents the retrieved context from your knowledge base and `FaithfulnessScorer(threshold=0.5)`
 is a scorer that checks if the output is hallucinated relative to the retrieved context.
 - Scorers give values between 0 - 1 and we set the threshold for this scorer to 0.5 in the context of a unit test. If you are interested in measuring rather than testing, you can ignore this threshold and reference the `score` field alone.
-- We chose `gpt-4o` as our judge model for faithfulness. Judgment Labs offers ANY judge model for your evaluation needs.
+- We chose `gpt-4o` as our judge model for faithfulness. Judgment Labs offers ANY judge model for your evaluation needs. Consider trying out our state-of-the-art judge models for your next evaluation!
 
 # Create Your First Scorer
-`judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made,
+`judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made, classifier scorers, and custom scorers.
 
 ## Ready-made Scorers
 Judgment Labs provides default implementations of 10+ research-backed metrics covering evaluation needs ranging from hallucination detection to RAG retrieval quality. To create a ready-made scorer, just import it directly from `judgeval.scorers`:
 
 ```python scorer_example.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import FaithfulnessScorer
 
@@ -91,15 +95,29 @@ print(results)
 For a complete list of ready-made scorers, see the [scorers docs](/evaluation/scorers).
 </Note>
 
-##
+## Classifier Scorers
 `judgeval` allows you to create custom scorers using natural language. These can range from simple judges to powerful evaluators for your LLM systems.
 
+```python classifier_scorer.py
+from judgeval.scorers import ClassifierScorer
+
+classifier_scorer = ClassifierScorer(
+    name="Tone Scorer",
+    threshold=0.9,
+    conversation=[
+        {
+            "role": "system",
+            "content": "Is the response positive (Y/N)? The response is: {{actual_output}}."
+        }
+    ],
+    options={"Y": 1, "N": 0}
+)
 ```
-
-
+
+To learn more about `ClassifierScorer`s, click [here](/evaluation/scorers/classifier_scorer).
 
 ## Custom Scorers
-If you find that none of the ready-made scorers or
+If you find that none of the ready-made scorers or classifier scorers fit your needs, you can easily create your own custom scorer.
 These can be as simple or complex as you need them to be and **_do not_** have to use an LLM judge model.
 Here's an example of computing BLEU scores:
 
@@ -148,7 +166,7 @@ If you're interested in measuring multiple metrics at once, you can group scorer
 regardless of the type of scorer.
 
 ```python multiple_scorers.py
-from judgeval
+from judgeval import JudgmentClient
 from judgeval.scorers import FaithfulnessScorer, SummarizationScorer
 
 client = JudgmentClient()
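
Editor's note: the `multiple_scorers.py` snippet is cut off at the hunk boundary above. A plausible completion sketch (not the file's actual continuation; the example fields are placeholders) showing several scorers grouped into one evaluation run, as the surrounding prose describes:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, SummarizationScorer

client = JudgmentClient()

# Placeholder example; real field values depend on your application.
example = Example(
    input="Summarize our return policy.",
    actual_output="All items can be returned within 30 days for a full refund.",
    retrieval_context=["Return policy, all items: 30-day limit for full refund, no questions asked."],
)

# Both scorers run against the same example in a single evaluation run.
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5), SummarizationScorer(threshold=0.5)],
    model="gpt-4o",
)
print(results)
```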
@@ -221,41 +239,6 @@ Work in progress!
 
 Work in progress!
 
-## Creating ClassifierScorers
-
-ClassifierScorers are **powerful** evaluators that can be created in minutes via Judgment's platform or SDK
-using **natural language criteria**.
-
-<Tip>
-For more information on what a ClassifierScorer is, click [here](/evaluation/scorers/classifier_scorer).
-</Tip>
-
-**Here's how to create a ClassifierScorer:**
-
-1. Navigate to the `Scorers` tab in the Judgment platform. You'll find this on via the sidebar on the left.
-2. Click the `Create Scorer` button in the top right corner.
-
-
-
-3. Here, you can create a custom scorer by using a criteria in natural language, supplying custom arguments from the [`Example`](evaluation/data_examples) class.
-Then, you supply a set of **choices** the scorer can select from when evaluating an example. Finally, you can test your scorer on samples in our playground.
-
-4. Once you're finished, you can save the scorer and use it in your evaluation runs just like any other scorer in `judgeval`.
-
-### Example
-
-Here's an example of building a `ClassifierScorer` that checks if the LLM's tone is too aggressive.
-This might be useful when building a customer support chatbot.
-
-
-
-<Tip>
-A great use of ClassifierScorers is to prototype an evaluation criteria on a small set of examples before
-using it to benchmark your workflow.
-
-To learn more about `ClassifierScorer`s, click [here](/evaluation/scorers/classifier_scorer).
-</Tip>
-
 ## Optimizing Your LLM System
 
 Evaluation is a **prerequisite** for optimizing your LLM systems. Measuring the quality of your LLM workflows
@@ -284,7 +267,9 @@ Beyond experimenting and measuring historical performance, `judgeval` supports m
 Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
 Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.
 
-
+<div style={{display: 'flex', justifyContent: 'center'}}>
+
+</div>
 
 There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
 - Detecting hallucinations and other quality issues **before they reach your customers**
Binary file
@@ -25,10 +25,6 @@
       "url": "https://github.com/judgmentlabs"
     }
   ],
-  "topbarCtaButton": {
-    "name": "Dashboard",
-    "url": "https://dashboard.mintlify.com"
-  },
   "tabs": [
     {
       "name": "Tutorials",
@@ -60,6 +56,7 @@
       "group": "Scorers",
       "pages": [
         "evaluation/scorers/introduction",
+        "evaluation/scorers/answer_correctness",
         "evaluation/scorers/answer_relevancy",
         "evaluation/scorers/contextual_precision",
         "evaluation/scorers/contextual_recall",
@@ -76,6 +73,14 @@
       "evaluation/judges"
       ]
     },
+    {
+      "group": "Monitoring",
+      "pages": [
+        "monitoring/introduction",
+        "monitoring/tracing",
+        "monitoring/production_insights"
+      ]
+    },
     {
       "group": "Judgment Platform",
       "pages": [
File without changes
|