judgeval 0.0.3__tar.gz → 0.0.4__tar.gz

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (140)
  1. {judgeval-0.0.3 → judgeval-0.0.4}/.github/workflows/ci.yaml +1 -0
  2. {judgeval-0.0.3 → judgeval-0.0.4}/.gitignore +1 -0
  3. {judgeval-0.0.3 → judgeval-0.0.4}/PKG-INFO +1 -1
  4. {judgeval-0.0.3 → judgeval-0.0.4}/Pipfile +3 -0
  5. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/introduction.mdx +18 -20
  6. judgeval-0.0.4/docs/evaluation/scorers/answer_correctness.mdx +56 -0
  7. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/answer_relevancy.mdx +1 -1
  8. judgeval-0.0.4/docs/evaluation/scorers/classifier_scorer.mdx +90 -0
  9. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_precision.mdx +1 -1
  10. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_recall.mdx +1 -1
  11. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/contextual_relevancy.mdx +1 -1
  12. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/faithfulness.mdx +3 -4
  13. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/hallucination.mdx +3 -4
  14. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/json_correctness.mdx +3 -4
  15. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/summarization.mdx +3 -4
  16. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/tool_correctness.mdx +3 -4
  17. {judgeval-0.0.3 → judgeval-0.0.4}/docs/getting_started.mdx +31 -46
  18. judgeval-0.0.4/docs/images/trace_screenshot.png +0 -0
  19. judgeval-0.0.4/docs/judgment/introduction.mdx +7 -0
  20. {judgeval-0.0.3 → judgeval-0.0.4}/docs/mint.json +9 -4
  21. judgeval-0.0.4/docs/monitoring/tracing.mdx +0 -0
  22. {judgeval-0.0.3 → judgeval-0.0.4}/pyproject.toml +1 -1
  23. judgeval-0.0.4/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +781 -0
  24. judgeval-0.0.4/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  25. judgeval-0.0.4/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +1 -0
  26. judgeval-0.0.4/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +1375 -0
  27. judgeval-0.0.4/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +20 -0
  28. judgeval-0.0.4/src/demo/cookbooks/openai_travel_agent/agent.py +208 -0
  29. judgeval-0.0.4/src/demo/cookbooks/openai_travel_agent/populate_db.py +73 -0
  30. judgeval-0.0.4/src/judgeval/__init__.py +12 -0
  31. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/tracer.py +57 -31
  32. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/constants.py +1 -0
  33. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/__init__.py +2 -1
  34. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/scorer_data.py +2 -2
  35. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/evaluation_run.py +16 -15
  36. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/__init__.py +2 -2
  37. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/base_judge.py +1 -1
  38. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/litellm_judge.py +2 -2
  39. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/mixture_of_judges.py +2 -2
  40. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/together_judge.py +2 -2
  41. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judges/utils.py +4 -4
  42. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/judgment_client.py +67 -15
  43. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/run_evaluation.py +79 -14
  44. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/__init__.py +8 -4
  45. judgeval-0.0.4/src/judgeval/scorers/api_scorer.py +64 -0
  46. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/base_scorer.py +3 -2
  47. judgeval-0.0.4/src/judgeval/scorers/exceptions.py +11 -0
  48. judgeval-0.0.3/src/judgeval/scorers/custom_scorer.py → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorer.py +9 -5
  49. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/__init__.py +144 -0
  50. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  51. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  52. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/answer_relevancy.py +2 -2
  53. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_precision.py +2 -2
  54. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_recall.py +2 -2
  55. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/contextual_relevancy.py +2 -2
  56. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/faithfulness.py +2 -2
  57. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/hallucination.py +2 -2
  58. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/json_correctness.py +7 -7
  59. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/summarization.py +2 -2
  60. {judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers → judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/api_scorers}/tool_correctness.py +2 -2
  61. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  62. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  63. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  64. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  65. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  66. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  67. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  68. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  69. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  70. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  71. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  72. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  73. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  74. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  75. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  76. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  77. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  78. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  79. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  80. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  81. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  82. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  83. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  84. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  85. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  86. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  87. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  88. judgeval-0.0.4/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  89. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/prompt_scorer.py +4 -4
  90. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/score.py +14 -14
  91. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/scorers/utils.py +40 -6
  92. judgeval-0.0.3/src/judgeval/__init__.py +0 -83
  93. judgeval-0.0.3/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -21
  94. {judgeval-0.0.3 → judgeval-0.0.4}/LICENSE.md +0 -0
  95. {judgeval-0.0.3 → judgeval-0.0.4}/README.md +0 -0
  96. {judgeval-0.0.3 → judgeval-0.0.4}/docs/README.md +0 -0
  97. {judgeval-0.0.3 → judgeval-0.0.4}/docs/development.mdx +0 -0
  98. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/code.mdx +0 -0
  99. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/images.mdx +0 -0
  100. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/markdown.mdx +0 -0
  101. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/navigation.mdx +0 -0
  102. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/reusable-snippets.mdx +0 -0
  103. {judgeval-0.0.3 → judgeval-0.0.4}/docs/essentials/settings.mdx +0 -0
  104. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/data_datasets.mdx +0 -0
  105. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/data_examples.mdx +0 -0
  106. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/judges.mdx +0 -0
  107. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  108. {judgeval-0.0.3 → judgeval-0.0.4}/docs/evaluation/scorers/introduction.mdx +0 -0
  109. {judgeval-0.0.3 → judgeval-0.0.4}/docs/favicon.svg +0 -0
  110. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/checks-passed.png +0 -0
  111. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/create_aggressive_scorer.png +0 -0
  112. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/create_scorer.png +0 -0
  113. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/evaluation_diagram.png +0 -0
  114. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/hero-dark.svg +0 -0
  115. {judgeval-0.0.3 → judgeval-0.0.4}/docs/images/hero-light.svg +0 -0
  116. {judgeval-0.0.3 → judgeval-0.0.4}/docs/introduction.mdx +0 -0
  117. {judgeval-0.0.3 → judgeval-0.0.4}/docs/logo/dark.svg +0 -0
  118. {judgeval-0.0.3 → judgeval-0.0.4}/docs/logo/light.svg +0 -0
  119. {judgeval-0.0.3/docs/judgment → judgeval-0.0.4/docs/monitoring}/introduction.mdx +0 -0
  120. /judgeval-0.0.3/docs/evaluation/scorers/classifier_scorer.mdx → /judgeval-0.0.4/docs/monitoring/production_insights.mdx +0 -0
  121. {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/create_dataset.ipynb +0 -0
  122. {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/create_scorer.ipynb +0 -0
  123. {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/demo.ipynb +0 -0
  124. {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/prompt_scorer.ipynb +0 -0
  125. {judgeval-0.0.3 → judgeval-0.0.4}/docs/notebooks/quickstart.ipynb +0 -0
  126. {judgeval-0.0.3 → judgeval-0.0.4}/docs/quickstart.mdx +0 -0
  127. {judgeval-0.0.3 → judgeval-0.0.4}/docs/snippets/snippet-intro.mdx +0 -0
  128. {judgeval-0.0.3 → judgeval-0.0.4}/pytest.ini +0 -0
  129. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/clients.py +0 -0
  130. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/__init__.py +0 -0
  131. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/exceptions.py +0 -0
  132. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/logger.py +0 -0
  133. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/common/utils.py +0 -0
  134. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/api_example.py +0 -0
  135. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/__init__.py +0 -0
  136. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/dataset.py +0 -0
  137. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/ground_truth.py +0 -0
  138. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/datasets/utils.py +0 -0
  139. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/example.py +0 -0
  140. {judgeval-0.0.3 → judgeval-0.0.4}/src/judgeval/data/result.py +0 -0
@@ -40,4 +40,5 @@ jobs:

  - name: Run tests
  run: |
+ cd src
  pipenv run pytest
@@ -8,6 +8,7 @@ __pycache__/

  # Testing files for competitor packages
  demo/test_competitors.py
+ src/e2etests/customer_usecases/

  # Packages
  *.egg
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.3
+ Version: 0.0.4
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -16,6 +16,9 @@ openai = "*"
  together = "*"
  anthropic = "*"
  patronus = "*"
+ asyncio = "*"
+ nest-asyncio = "*"
+ tavily-python = "*"

  [dev-packages]
  pytest = "*"
@@ -8,25 +8,6 @@ Evaluation is the process of **scoring** an LLM system's outputs with metrics; a
  - An evaluation dataset
  - Metrics we are interested in tracking

- The ideal fit of evaluation into an application workflow looks like this:
-
- ![Alt text](/images/evaluation_diagram.png "Optional title")
-
- ## Metrics
-
- `judgeval` comes with a set of 10+ built-in evaluation metrics. These metrics are accessible through `judgeval`'s `Scorer` interface.
- Every `Scorer` has a `threshold` parameter that you can use in the context of unit testing your app.
-
- ```python scorer.py
- from judgeval.scorers import FaithfulnessScorer
-
- scorer = FaithfulnessScorer(threshold=1.0)
- ```
- You can use scorers to evaluate your LLM system's outputs by using `Example`s.
-
- <Tip>
- We're always working on adding new scorers, so if you have a metric you'd like to add, please [let us know!](mailto:contact@judgmentlabs.ai)
- </Tip>

  ## Examples

@@ -54,7 +35,7 @@ Creating an Example allows you to evaluate using
  `judgeval`'s default scorers:

  ```python example.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.scorers import FaithfulnessScorer

  client = JudgmentClient()
@@ -102,6 +83,23 @@ results = client.evaluate_dataset(
  )
  ```

+ ## Metrics
+
+ `judgeval` comes with a set of 10+ built-in evaluation metrics. These metrics are accessible through `judgeval`'s `Scorer` interface.
+ Every `Scorer` has a `threshold` parameter that you can use in the context of unit testing your app.
+
+ ```python scorer.py
+ from judgeval.scorers import FaithfulnessScorer
+
+ scorer = FaithfulnessScorer(threshold=1.0)
+ ```
+ You can use scorers to evaluate your LLM system's outputs by using `Example`s.
+
+ <Tip>
+ We're always working on adding new scorers, so if you have a metric you'd like to add, please [let us know!](mailto:contact@judgmentlabs.ai)
+ </Tip>
+
+
  **Congratulations!** 🎉

  You've learned the basics of building and running evaluations with `judgeval`.
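
The Metrics block added above is what ties a scorer's `threshold` to unit testing. A minimal sketch of that pattern, using only the `run_evaluation` call shown throughout these docs, could look like the following; the boolean `success` attribute read at the end is an assumption about the result objects, not a documented field.

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

def test_faithfulness():
    client = JudgmentClient()
    example = Example(
        input="What is your return policy?",
        actual_output="All items can be returned within 30 days for a full refund.",
        retrieval_context=["Return policy, all items: 30-day limit for full refund."],
    )
    # threshold=1.0 means the example only passes on a perfect faithfulness score
    results = client.run_evaluation(
        examples=[example],
        scorers=[FaithfulnessScorer(threshold=1.0)],
        model="gpt-4o",
    )
    # Assumed result shape: one result per example with a pass/fail flag.
    assert all(getattr(r, "success", False) for r in results)
```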
@@ -0,0 +1,56 @@
+ ---
+ title: Answer Correctness
+ description: ""
+ ---
+
+ The answer correctness scorer is a default LLM judge scorer that measures how correct/consistent the LLM system's `actual_output` is with the `expected_output`.
+ In practice, this scorer helps determine whether your LLM application produces **answers that are consistent with golden/ground truth answers**.
+
+
+ ## Required Fields
+
+ To run the answer correctness scorer, you must include the following fields in your `Example`:
+ - `input`
+ - `actual_output`
+ - `expected_output`
+
+ ## Scorer Breakdown
+
+ `AnswerCorrectness` scores are calculated by extracting statements made in the `expected_output` and classifying how many are consistent/correct with respect to the `actual_output`.
+
+ The score is calculated as:
+
+ $$
+ \text{correctness score} = \frac{\text{correct statements}}{\text{total statements}}
+ $$
+
+ ## Sample Implementation
+
+ ```python answer_correctness.py
+ from judgeval import JudgmentClient
+ from judgeval.data import Example
+ from judgeval.scorers import AnswerCorrectnessScorer
+
+ client = JudgmentClient()
+ example = Example(
+ input="What's your return policy for a pair of socks?",
+ # Replace this with your LLM system's output
+ actual_output="We offer a 30-day return policy for all items, including socks!",
+ # Replace this with your golden/ground truth answer
+ expected_output="Socks can be returned within 30 days of purchase.",
+ )
+ # supply your own threshold
+ scorer = AnswerCorrectnessScorer(threshold=0.8)
+
+ results = client.run_evaluation(
+ examples=[example],
+ scorers=[scorer],
+ model="gpt-4o",
+ )
+ print(results)
+ ```
+
+ <Note>
+ The `AnswerCorrectness` scorer uses an LLM judge, so you'll receive a reason for the score in the `reason` field of the results.
+ This allows you to double-check the accuracy of the evaluation and understand how the score was calculated.
+ </Note>
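
As an illustrative reading of the formula above (not taken from the package docs): if the judge extracts four statements from the `expected_output` and finds three of them consistent with the `actual_output`, then

$$
\text{correctness score} = \frac{3}{4} = 0.75,
$$

which would fall short of the `threshold=0.8` used in the sample implementation.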
@@ -30,7 +30,7 @@ $$
  ## Sample Implementation

  ```python answer_relevancy.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import AnswerRelevancyScorer

@@ -0,0 +1,90 @@
+ ---
+ title: Classifier Scorers
+ description: ""
+ ---
+
+ A `ClassifierScorer` is a powerful tool for evaluating your LLM system using natural language criteria.
+ Classifier scorers are great for prototyping new evaluation criteria on a small set of examples before using them to benchmark your workflows at scale.
+
+ ## Creating a Classifier Scorer
+
+ ### `judgeval` SDK
+
+ You can create a `ClassifierScorer` by providing a natural language description of your evaluation task/criteria and a set of choices that an LLM judge can choose from when evaluating an example.
+ Here's an example of creating a `ClassifierScorer` that determines if a response is friendly or not:
+
+ ```python friendliness_scorer.py
+
+ from judgeval.scorers import ClassifierScorer
+
+ friendliness_scorer = ClassifierScorer(
+ name="Friendliness Scorer",
+ threshold=1.0,
+ conversation=[
+ {
+ "role": "system",
+ "content": "Is the response positive (Y/N)? The response is: {{actual_output}}."
+ }
+ ],
+ options={"Y": 1, "N": 0}
+ )
+ ```
+
+ <Tip>
+ You can put variables from [`Example`s](/evaluation/data_examples) into your `conversation` by using the mustache `{{variable_name}}` syntax.
+ </Tip>
+
+ ### `Judgment` Platform
+
+ 1. Navigate to the `Scorers` tab in the Judgment platform. You'll find it in the sidebar on the left.
+ 2. Click the `Create Scorer` button in the top right corner.
+
+ ![Alt text](/images/create_scorer.png "Optional title")
+
+ 3. Here, you can create a custom scorer by writing your criteria in natural language and supplying custom arguments from the [`Example`](/evaluation/data_examples) class.
+ Then, you supply a set of **choices** the scorer can select from when evaluating an example. Finally, you can test your scorer on samples in our playground.
+
+ 4. Once you're finished, you can save the scorer and use it in your evaluation runs just like any other scorer in `judgeval`.
+
+ #### Example
+
+ Here's an example of building a similar `ClassifierScorer` that checks if the LLM's tone is too aggressive.
+
+ ![Alt text](/images/create_aggressive_scorer.png "Optional title")
+
+
+ ## Using a Classifier Scorer
+
+ Classifier scorers can be used in the same way as any other scorer in `judgeval`.
+ They can also be run in conjunction with other scorers in a single evaluation run!
+
+ ```python run_classifier_scorer.py
+ ...
+
+ results = client.run_evaluation(
+ examples=[example1],
+ scorers=[friendliness_scorer],
+ model="gpt-4o"
+ )
+ ```
+
+ ### Saving Classifier Scorers
+
+ Whether you create a `ClassifierScorer` via the `judgeval` SDK or the Judgment platform, you can save it to the `Judgment` platform for reuse in future evaluations.
+ - If you create a `ClassifierScorer` via the `judgeval` SDK, you can save it by calling `client.push_classifier_scorer()`.
+ - Similarly, you can load a `ClassifierScorer` by calling `client.fetch_classifier_scorer()`.
+ - Each `ClassifierScorer` has a **unique slug** that you can use to identify it.
+
+ ```python
+ from judgeval import JudgmentClient
+
+ client = JudgmentClient()
+
+ # Saving a ClassifierScorer from SDK to platform
+ friendliness_slug = client.push_classifier_scorer(friendliness_scorer)
+
+ # Loading a ClassifierScorer from platform to SDK
+ classifier_scorer = client.fetch_classifier_scorer("classifier-scorer-slug")
+ ```
+
+ TODO add image of slugs on the platform
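
Putting the pieces above together, a minimal sketch of the push/fetch round trip could look like the following; the scorer definition, slug handling, and example contents are placeholders for illustration, and only the `push_classifier_scorer`, `fetch_classifier_scorer`, and `run_evaluation` calls shown in these docs are assumed.

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ClassifierScorer

client = JudgmentClient()

# Define a scorer (same shape as the friendliness example above).
friendliness_scorer = ClassifierScorer(
    name="Friendliness Scorer",
    threshold=1.0,
    conversation=[
        {"role": "system",
         "content": "Is the response friendly (Y/N)? The response is: {{actual_output}}."}
    ],
    options={"Y": 1, "N": 0},
)

# Push it to the platform; the returned slug identifies it for later runs.
slug = client.push_classifier_scorer(friendliness_scorer)

# Later (or from another machine), fetch it by slug and use it like any other scorer.
fetched_scorer = client.fetch_classifier_scorer(slug)

example = Example(
    input="Where is my order?",
    actual_output="Happy to help! Your order shipped yesterday and arrives tomorrow.",
)
results = client.run_evaluation(
    examples=[example],
    scorers=[fetched_scorer],
    model="gpt-4o",
)
print(results)
```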
@@ -42,7 +42,7 @@ Our contextual precision scorer is based on Stanford NLP's [ARES](https://arxiv.
  ## Sample Implementation

  ```python contextual_precision.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import ContextualPrecisionScorer

@@ -41,7 +41,7 @@ Our contextual recall scorer is based on Stanford NLP's [ARES](https://arxiv.org
  ## Sample Implementation

  ```python contextual_recall.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import ContextualRecallScorer

@@ -31,7 +31,7 @@ Our contextual relevancy scorer is based on Stanford NLP's [ARES](https://arxiv.
  ## Sample Implementation

  ```python contextual_relevancy.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import ContextualRelevancyScorer

@@ -37,10 +37,9 @@ $$
  ## Sample Implementation

  ```python faithfulness.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer
- from judgeval.constants import APIScorer
+ from judgeval.scorers import FaithfulnessScorer

  client = JudgmentClient()
  example = Example(
@@ -51,7 +50,7 @@ example = Example(
  retrieval_context=["Return policy, all items: 30-day limit for full refund, no questions asked."]
  )
  # supply your own threshold
- scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.FAITHFULNESS)
+ scorer = FaithfulnessScorer(threshold=0.8)

  results = client.run_evaluation(
  examples=[example],
@@ -30,10 +30,9 @@ $$
  ## Sample Implementation

  ```python hallucination.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer
- from judgeval.constants import APIScorer
+ from judgeval.scorers import HallucinationScorer

  client = JudgmentClient()
  example = Example(
@@ -44,7 +43,7 @@ example = Example(
  context=["**RETURN POLICY** all products returnable with no cost for 30-days after purchase (receipt required)."]
  )
  # supply your own threshold
- scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.HALLUCINATION)
+ scorer = HallucinationScorer(threshold=0.8)

  results = client.run_evaluation(
  examples=[example],
@@ -35,17 +35,16 @@ $$
  ## Sample Implementation

  ```python json_correctness.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer
- from judgeval.constants import APIScorer
+ from judgeval.scorers import JSONCorrectnessScorer
  client = JudgmentClient()
  example = Example(
  input="Create a JSON object with the keys 'field1' (str) and 'field2' (int). Fill them with random values.",
  # Replace this with your LLM system's output
  actual_output="{'field1': 'value1', 'field2': 1}",
  )
- scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.JSON_CORRECTNESS) # TODO update this
+ scorer = JSONCorrectnessScorer(threshold=0.8)
  results = client.run_evaluation(
  examples=[example],
  scorers=[scorer],
@@ -40,10 +40,9 @@ $$
  ## Sample Implementation

  ```python summarization.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer
- from judgeval.constants import APIScorer
+ from judgeval.scorers import SummarizationScorer

  client = JudgmentClient()
  example = Example(
@@ -52,7 +51,7 @@ example = Example(
  actual_output="...",
  )
  # supply your own threshold
- scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.SUMMARIZATION)
+ scorer = SummarizationScorer(threshold=0.8)

  results = client.run_evaluation(
  examples=[example],
@@ -27,10 +27,9 @@ TODO add more docs here regarding tool ordering, exact match, or even correct to
  ## Sample Implementation

  ```python tool_correctness.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer
- from judgeval.constants import APIScorer
+ from judgeval.scorers import ToolCorrectnessScorer

  client = JudgmentClient()
  example = Example(
@@ -40,7 +39,7 @@ example = Example(
  expected_output=["DBQuery", "GoogleSearch"],
  )
  # supply your own threshold
- scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.TOOL_CORRECTNESS)
+ scorer = ToolCorrectnessScorer(threshold=0.8)

  results = client.run_evaluation(
  examples=[example],
@@ -19,19 +19,23 @@ access our state-of-the-art judge models, and manage your evaluations/datasets o
  Once you have a key, you can set the environment variable `JUDGMENT_API_KEY` to your key.
  This allows the `JudgmentClient` to authenticate your requests to the Judgment API.

+ ```
+ export JUDGMENT_API_KEY="your_key_here"
+ ```
+
  To receive a key, please email us at `contact@judgmentlabs.ai`.


  <Note>
  Running evaluations on Judgment Labs' infrastructure is recommended for
  large-scale evaluations. [Contact us](mailto:contact@judgmentlabs.ai) if you're dealing with
- sensitive data that has to reside in your private VPCs/On-Prem.
+ sensitive data that has to reside in your private VPCs.
  </Note>

  # Create your first evaluation

  ```python sample_eval.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import FaithfulnessScorer

@@ -58,16 +62,16 @@ Congratulations! Your evaluation should have passed. Let's break down what happe
  - The variable `retrieval_context` represents the retrieved context from your knowledge base and `FaithfulnessScorer(threshold=0.5)`
  is a scorer that checks if the output is hallucinated relative to the retrieved context.
  - Scorers give values between 0 and 1 and we set the threshold for this scorer to 0.5 in the context of a unit test. If you are interested in measuring rather than testing, you can ignore this threshold and reference the `score` field alone.
- - We chose `gpt-4o` as our judge model for faithfulness. Judgment Labs offers ANY judge model for your evaluation needs.
+ - We chose `gpt-4o` as our judge model for faithfulness. Judgment Labs offers ANY judge model for your evaluation needs. Consider trying out our state-of-the-art judge models for your next evaluation!

  # Create Your First Scorer
- `judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made, prompt scorers, and custom scorers.
+ `judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made, classifier scorers, and custom scorers.

  ## Ready-made Scorers
  Judgment Labs provides default implementations of 10+ research-backed metrics covering evaluation needs ranging from hallucination detection to RAG retrieval quality. To create a ready-made scorer, just import it directly from `judgeval.scorers`:

  ```python scorer_example.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import FaithfulnessScorer

@@ -91,15 +95,29 @@ print(results)
  For a complete list of ready-made scorers, see the [scorers docs](/evaluation/scorers).
  </Note>

- ## Prompt Scorers
+ ## Classifier Scorers
  `judgeval` allows you to create custom scorers using natural language. These can range from simple judges to powerful evaluators for your LLM systems.

+ ```python classifier_scorer.py
+ from judgeval.scorers import ClassifierScorer
+
+ classifier_scorer = ClassifierScorer(
+ name="Tone Scorer",
+ threshold=0.9,
+ conversation=[
+ {
+ "role": "system",
+ "content": "Is the response positive (Y/N)? The response is: {{actual_output}}."
+ }
+ ],
+ options={"Y": 1, "N": 0}
+ )
  ```
- TODO
- ```
+
+ To learn more about `ClassifierScorer`s, click [here](/evaluation/scorers/classifier_scorer).

  ## Custom Scorers
- If you find that none of the ready-made scorers or prompt scorers fit your needs, you can easily create your own custom scorer.
+ If you find that none of the ready-made scorers or classifier scorers fit your needs, you can easily create your own custom scorer.
  These can be as simple or complex as you need them to be and **_do not_** have to use an LLM judge model.
  Here's an example of computing BLEU scores:

@@ -148,7 +166,7 @@ If you're interested in measuring multiple metrics at once, you can group scorer
  regardless of the type of scorer.

  ```python multiple_scorers.py
- from judgeval.judgment_client import JudgmentClient
+ from judgeval import JudgmentClient
  from judgeval.scorers import FaithfulnessScorer, SummarizationScorer

  client = JudgmentClient()
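
For reference, a completed multi-scorer call in the style of this snippet might look like the sketch below; the example contents and thresholds are placeholders, and only the `run_evaluation` signature shown throughout these docs is assumed.

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, SummarizationScorer

client = JudgmentClient()

example = Example(
    input="Summarize the return policy.",
    actual_output="All items can be returned within 30 days for a full refund.",
    retrieval_context=["Return policy, all items: 30-day limit for full refund."],
)

# Both scorers run over the same example in a single evaluation.
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5), SummarizationScorer(threshold=0.5)],
    model="gpt-4o",
)
print(results)
```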
@@ -221,41 +239,6 @@ Work in progress!

  Work in progress!

- ## Creating ClassifierScorers
-
- ClassifierScorers are **powerful** evaluators that can be created in minutes via Judgment's platform or SDK
- using **natural language criteria**.
-
- <Tip>
- For more information on what a ClassifierScorer is, click [here](/evaluation/scorers/classifier_scorer).
- </Tip>
-
- **Here's how to create a ClassifierScorer:**
-
- 1. Navigate to the `Scorers` tab in the Judgment platform. You'll find this on via the sidebar on the left.
- 2. Click the `Create Scorer` button in the top right corner.
-
- ![Alt text](/images/create_scorer.png "Optional title")
-
- 3. Here, you can create a custom scorer by using a criteria in natural language, supplying custom arguments from the [`Example`](evaluation/data_examples) class.
- Then, you supply a set of **choices** the scorer can select from when evaluating an example. Finally, you can test your scorer on samples in our playground.
-
- 4. Once you're finished, you can save the scorer and use it in your evaluation runs just like any other scorer in `judgeval`.
-
- ### Example
-
- Here's an example of building a `ClassifierScorer` that checks if the LLM's tone is too aggressive.
- This might be useful when building a customer support chatbot.
-
- ![Alt text](images/create_aggressive_scorer.png "Optional title")
-
- <Tip>
- A great use of ClassifierScorers is to prototype an evaluation criteria on a small set of examples before
- using it to benchmark your workflow.
-
- To learn more about `ClassifierScorer`s, click [here](/evaluation/scorers/classifier_scorer).
- </Tip>
-
  ## Optimizing Your LLM System

  Evaluation is a **prerequisite** for optimizing your LLM systems. Measuring the quality of your LLM workflows
@@ -284,7 +267,9 @@ Beyond experimenting and measuring historical performance, `judgeval` supports m
  Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
  Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.

- TODO add picture of tracing, or an embedded gif
+ <div style={{display: 'flex', justifyContent: 'center'}}>
+ ![Alt text](/images/trace_screenshot.png "Image of a RAG pipeline trace")
+ </div>

  There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
  - Detecting hallucinations and other quality issues **before they reach your customers**
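
The tracing API itself is not documented in this diff (docs/monitoring/tracing.mdx is added empty), so the snippet below is only a hypothetical sketch of what instrumenting a function could look like; the `Tracer` class name, `observe` decorator, and their arguments are assumptions rather than the package's confirmed 0.0.4 interface — the actual API lives in `src/judgeval/common/tracer.py`.

```python
# Hypothetical sketch: Tracer, observe, project_name, and span_type are assumed names,
# not the confirmed judgeval 0.0.4 tracing interface.
from judgeval.common.tracer import Tracer  # module path taken from the file list above

judgment = Tracer(project_name="travel_agent")  # assumed constructor

@judgment.observe(span_type="function")  # assumed decorator
def answer_question(question: str) -> str:
    # ...call your retrieval + LLM pipeline here...
    return "Here is a three-day Paris itinerary..."

if __name__ == "__main__":
    print(answer_question("Plan a 3-day trip to Paris"))
```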
@@ -0,0 +1,7 @@
+ ---
+ title: Introduction
+ ---
+
+ The Judgment platform is a tool for viewing and analyzing evaluations in **development and production**.
+
+
@@ -25,10 +25,6 @@
  "url": "https://github.com/judgmentlabs"
  }
  ],
- "topbarCtaButton": {
- "name": "Dashboard",
- "url": "https://dashboard.mintlify.com"
- },
  "tabs": [
  {
  "name": "Tutorials",
@@ -60,6 +56,7 @@
  "group": "Scorers",
  "pages": [
  "evaluation/scorers/introduction",
+ "evaluation/scorers/answer_correctness",
  "evaluation/scorers/answer_relevancy",
  "evaluation/scorers/contextual_precision",
  "evaluation/scorers/contextual_recall",
@@ -76,6 +73,14 @@
  "evaluation/judges"
  ]
  },
+ {
+ "group": "Monitoring",
+ "pages": [
+ "monitoring/introduction",
+ "monitoring/tracing",
+ "monitoring/production_insights"
+ ]
+ },
  {
  "group": "Judgment Platform",
  "pages": [
File without changes
@@ -1,6 +1,6 @@
  [project]
  name = "judgeval"
- version = "0.0.3"
+ version = "0.0.4"
  authors = [
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
  { name="Alex Shan", email="alex@judgmentlabs.ai" },