judgeval-0.0.24.tar.gz → judgeval-0.0.25.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {judgeval-0.0.24 → judgeval-0.0.25}/PKG-INFO +1 -1
- {judgeval-0.0.24 → judgeval-0.0.25}/pyproject.toml +1 -1
- judgeval-0.0.25/src/demo/cookbooks/test.py +152 -0
- judgeval-0.0.25/src/demo/custom_scorer.py +60 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/.gitignore +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/LICENSE.md +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/Pipfile +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/Pipfile.lock +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/README.md +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/README.md +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/development.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/comparison.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/execution_order.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/groundedness.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/favicon.svg +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/integration/langgraph.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/introduction.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/logo/light.svg +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/mint.json +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/pytest.ini +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/common/tracer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/constants.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/judgment_client.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.24 → judgeval-0.0.25}/src/judgeval/utils/alerts.py +0 -0
judgeval-0.0.25/src/demo/cookbooks/test.py
@@ -0,0 +1,152 @@
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.scorers import AnswerRelevancyScorer
+from judgeval import JudgmentClient
+
+
+def create_sample_dataset():
+    # Define sample inputs
+    inputs = [
+        # Highly relevant Q/A pairs
+        "Who founded Microsoft?",
+        "What is the capital of France?",
+        "How does photosynthesis work?",
+        "What are the benefits of exercise?",
+        "Explain quantum computing in simple terms.",
+
+        # Somewhat relevant Q/A pairs
+        "What is machine learning?",
+        "How do electric cars work?",
+        "What causes climate change?",
+        "How does the human digestive system function?",
+        "What is blockchain technology?",
+
+        # Minimally relevant Q/A pairs
+        "What are the main programming languages?",
+        "How do I bake a chocolate cake?",
+        "What is the history of the Roman Empire?",
+        "How do vaccines work?",
+        "What are black holes?",
+
+        # Not relevant Q/A pairs
+        "What is the best smartphone to buy?",
+        "How tall is Mount Everest?",
+        "Who wrote Romeo and Juliet?",
+        "What is the population of Tokyo?",
+        "How do I change a flat tire?"
+    ]
+
+    # Define corresponding outputs
+    actual_outputs = [
+        # Highly relevant answers
+        "Bill Gates and Paul Allen founded Microsoft in 1975.",
+        "The capital of France is Paris, known for the Eiffel Tower and Louvre Museum.",
+        "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen.",
+        "Regular exercise improves cardiovascular health, builds muscle strength, reduces stress, and helps maintain a healthy weight.",
+        "Quantum computing uses quantum bits or qubits that can exist in multiple states simultaneously, allowing for potentially faster computation of certain problems compared to classical computers.",
+
+        # Somewhat relevant answers (partial or tangential information)
+        "Machine learning involves statistical techniques, but it's primarily about natural language processing and computer vision applications in modern businesses.",
+        "Electric cars use batteries, though the most important aspect is their impact on reducing traffic congestion in urban areas.",
+        "Climate change is related to weather patterns, but it's mainly caused by volcanic eruptions and natural planetary cycles.",
+        "The digestive system breaks down food, but the most interesting part is how it connects to brain function and mental health.",
+        "Blockchain is a distributed ledger technology, though its primary purpose is to replace traditional banking systems entirely.",
+
+        # Minimally relevant answers (mostly off-topic but with slight connection)
+        "Programming languages include Python and JavaScript, but the real question is whether AI will replace programmers in the next decade.",
+        "Chocolate cakes require flour and sugar, but I'd recommend focusing on gluten-free alternatives since they're healthier.",
+        "The Roman Empire lasted for centuries, but modern Italy's political system is more relevant to understand today's European politics.",
+        "Vaccines stimulate immune responses, but the pharmaceutical industry's profit motives are what you should really be concerned about.",
+        "Black holes are regions of spacetime, but the conspiracy theories about what NASA isn't telling us are far more interesting.",
+
+        # Not relevant answers (completely off-topic)
+        "The migration patterns of monarch butterflies are fascinating examples of evolutionary adaptation.",
+        "The Great Wall of China was built over multiple dynasties and stretches over 13,000 miles.",
+        "Photosynthesis is how plants convert sunlight into energy, producing oxygen as a byproduct.",
+        "The human genome contains approximately 3 billion base pairs of DNA.",
+        "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse equals the sum of squares of the other two sides."
+    ]
+
+    # Create Example objects from inputs and outputs
+    examples = []
+    for i in range(len(inputs)):
+        examples.append(Example(
+            input=inputs[i],
+            actual_output=actual_outputs[i]
+        ))
+
+    return EvalDataset(examples=examples)
+
+
+def save_dataset(client, dataset, alias):
+    """Save the dataset to Judgment API with the given alias"""
+    client.push_dataset(alias=alias, dataset=dataset)
+    print(f"Dataset saved with alias: {alias}")
+
+
+def run_evaluation(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_eval_run"):
+    """Pull a dataset and run an evaluation on it"""
+    # Pull the dataset from Judgment API
+    eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+    # Run the evaluation
+    results = client.evaluate_dataset(
+        dataset=eval_dataset,
+        scorers=[AnswerRelevancyScorer(threshold=0.8)],
+        model=model,
+        eval_run_name=eval_run_name,
+        project_name=project_name,
+    )
+
+    return results
+
+
+def run_assertion_test(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_assertion_run"):
+    """Pull a dataset and run assertion tests on its examples"""
+    # Pull the dataset from Judgment API
+    eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+    # Extract examples from the dataset
+    examples = eval_dataset.examples
+
+    # Run assertion tests on each example
+    # Run assertion test on all examples at once
+    client.assert_test(
+        examples=examples,
+        scorers=[AnswerRelevancyScorer(threshold=0.8)],
+        model=model,
+        project_name=project_name,
+        eval_run_name=eval_run_name
+    )
+
+
+def main():
+    client = JudgmentClient()
+
+    # Uncomment to create and save a new dataset
+    # dataset = create_sample_dataset()
+    # save_dataset(client, dataset, "jnpr_demo_dataset")
+
+    # # Run evaluation on the saved dataset
+    # results = run_evaluation(
+    #     client,
+    #     dataset_alias="jnpr_demo_dataset",
+    #     model="gpt-4o",
+    #     project_name="jnpr_mist_demo_project",
+    #     eval_run_name="jnpr_demo_eval"
+    # )
+
+    # Run assertion test on the saved dataset
+    results = run_assertion_test(
+        client,
+        dataset_alias="jnpr_demo_dataset",
+        model="gpt-4o",
+        project_name="jnpr_mist_demo_project",
+        eval_run_name="jnpr_demo_assertion"
+    )
+    return results
+
+
+if __name__ == "__main__":
+    results = main()
+    print(results)
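A note on the demo above: main() returns the value of run_assertion_test, which has no return statement, so the final print(results) prints None; the meaningful outcome comes from client.assert_test itself. If a local summary of run_evaluation's output is wanted instead, a minimal sketch follows, assuming each result object exposes a boolean success attribute (an assumption about judgeval's result schema, not something this diff confirms):

    # Hypothetical post-processing of run_evaluation() output.
    # Assumes each result exposes a boolean `success` attribute; adjust to
    # the actual judgeval result schema if it differs.
    def summarize(results):
        passed = sum(1 for r in results if getattr(r, "success", False))
        print(f"{passed}/{len(results)} examples met the 0.8 relevancy threshold")

    # With the tiered dataset above, roughly the five "highly relevant"
    # answers should pass and the remaining fifteen should fall short.
    # summarize(run_evaluation(client, "jnpr_demo_dataset"))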
judgeval-0.0.25/src/demo/custom_scorer.py
@@ -0,0 +1,60 @@
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import JudgevalScorer, AnswerRelevancyScorer
+
+client = JudgmentClient()
+
+
+class SampleScorer(JudgevalScorer):
+
+    def __init__(
+        self,
+        threshold=0.5,
+        score_type="Sample Scorer",
+        include_reason=True,
+        async_mode=True,
+        strict_mode=False,
+        verbose_mode=True
+    ):
+        super().__init__(score_type=score_type, threshold=threshold)
+        self.threshold = 1 if strict_mode else threshold
+        # Optional attributes
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def score_example(self, example):
+        self.score = 1.0
+        self.success = True
+        return self.score
+
+    async def a_score_example(self, example):
+        print("Scoring example...")
+        self.score = 1.0
+        self.success = True
+        return self.score
+
+    def _success_check(self):
+        print("Checking success...")
+        if self.error is not None:
+            return False
+        return self.score >= self.threshold
+
+    # @property
+    # def __name__(self):
+    #     return "Sample Scorer"
+
+
+if __name__ == "__main__":
+    scorer = SampleScorer()
+    example = Example(
+        input="What is the capital of France?",
+        actual_output="Paris",
+    )
+    results = client.run_evaluation(examples=[example],
+                                    scorers=[scorer],
+                                    model="gpt-4o",
+                                    project_name="custom-scorer",
+                                    eval_run_name="custom-scorer-demo-10",
+                                    ignore_errors=True)