judgeval 0.0.8__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.8 → judgeval-0.0.9}/PKG-INFO +1 -1
- {judgeval-0.0.8 → judgeval-0.0.9}/pyproject.toml +1 -1
- judgeval-0.0.9/src/demo/cookbooks/custom_scorers/competitor_mentions.py +58 -0
- judgeval-0.0.9/src/demo/cookbooks/custom_scorers/text2sql.py +205 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/__init__.py +2 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/__init__.py +4 -0
- judgeval-0.0.9/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +3 -0
- judgeval-0.0.9/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +3 -0
- judgeval-0.0.9/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +54 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/.gitignore +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/LICENSE.md +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/Pipfile +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/Pipfile.lock +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/README.md +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/README.md +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/development.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/favicon.svg +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/images/trace_screenshot.png +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/introduction.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/logo/light.svg +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/mint.json +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/pytest.ini +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/ci_testing/ci_testing.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/ci_testing/travel_response.txt +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/common/tracer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/constants.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/judgment_client.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/utils.py +0 -0
@@ -0,0 +1,58 @@
|
|
1
|
+
"""
|
2
|
+
This script implements a custom scorer to evaluate customer support responses.
|
3
|
+
|
4
|
+
It checks if a support response mentions competitors (like Adidas, Reebok, etc.) in a positive way.
|
5
|
+
The scorer penalizes responses that promote competitor products, helping maintain brand focus in
|
6
|
+
customer interactions. This would be useful to score customer support responses for Nike, for example.
|
7
|
+
"""
|
8
|
+
|
9
|
+
|
10
|
+
from judgeval import JudgmentClient
|
11
|
+
from judgeval.scorers import ClassifierScorer
|
12
|
+
from judgeval.data import Example
|
13
|
+
|
14
|
+
|
15
|
+
competitor_mentions_scorer = ClassifierScorer(
|
16
|
+
"Competitor Mentions",
|
17
|
+
slug="competitor_mentions-487126418",
|
18
|
+
threshold=1.0,
|
19
|
+
conversation=[{
|
20
|
+
"role": "system",
|
21
|
+
"content": """Does the following customer support response discuss any of the following competitors in a positive way? (Y/N).
|
22
|
+
|
23
|
+
Competitors: Adidas, Reebok, Hoka, ON, Converse
|
24
|
+
|
25
|
+
Customer Question: {{input}}
|
26
|
+
Customer Support Response: {{actual_output}}
|
27
|
+
"""
|
28
|
+
}],
|
29
|
+
options={
|
30
|
+
"Y": 0.0,
|
31
|
+
"N": 1.0
|
32
|
+
}
|
33
|
+
)
|
34
|
+
|
35
|
+
|
36
|
+
if __name__ == "__main__":
|
37
|
+
client = JudgmentClient()
|
38
|
+
|
39
|
+
positive_example = Example(
|
40
|
+
input="What are the best shoes for running priced under $130?",
|
41
|
+
actual_output="You'd want to check out the newest Nike Vaporfly, it's only $120 and built for performance. "
|
42
|
+
)
|
43
|
+
|
44
|
+
negative_example = Example(
|
45
|
+
input="What are the best shoes for running priced under $130?",
|
46
|
+
actual_output="The Nike Vaporfly is a great shoe built for performance. Other great options include the Adidas Ultraboost and the Reebok Nano X which are affordable and speedy."
|
47
|
+
)
|
48
|
+
|
49
|
+
client.run_evaluation(
|
50
|
+
examples=[positive_example, negative_example],
|
51
|
+
scorers=[competitor_mentions_scorer],
|
52
|
+
model="gpt-4o-mini",
|
53
|
+
project_name="competitor_mentions",
|
54
|
+
eval_run_name="competitor_mentions_test",
|
55
|
+
)
|
56
|
+
|
57
|
+
|
58
|
+
|
@@ -0,0 +1,205 @@
|
|
1
|
+
"""
|
2
|
+
This script is a cookbook of how to create a custom scorer using a ClassifierScorer.
|
3
|
+
|
4
|
+
Simply use a natural language prompt and guide the LLM to output a score based on the input by
|
5
|
+
choosing from a set of options.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from judgeval import JudgmentClient
|
9
|
+
from judgeval.scorers import ClassifierScorer
|
10
|
+
from judgeval.data import Example
|
11
|
+
|
12
|
+
text2sql_scorer = ClassifierScorer(
|
13
|
+
"Text to SQL",
|
14
|
+
slug="text2sql-487126418",
|
15
|
+
threshold=1.0,
|
16
|
+
conversation=[{
|
17
|
+
"role": "system",
|
18
|
+
"content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
|
19
|
+
|
20
|
+
** TASK INSTRUCTIONS **
|
21
|
+
Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
|
22
|
+
Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
|
23
|
+
|
24
|
+
** TIPS **
|
25
|
+
- Look for correct references to the table schema for column names, table names, etc.
|
26
|
+
- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
|
27
|
+
- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
|
28
|
+
- Verify that WHERE conditions use the correct operators and data types for comparisons
|
29
|
+
- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
|
30
|
+
- Check that JOINs use the correct keys and maintain referential integrity
|
31
|
+
- Verify that ORDER BY clauses use valid column names and sort directions
|
32
|
+
- Check for proper handling of NULL values where relevant
|
33
|
+
- Ensure subqueries are properly constructed and correlated when needed
|
34
|
+
- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
|
35
|
+
|
36
|
+
** FORMATTING YOUR ANSWER **
|
37
|
+
If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
|
38
|
+
IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
|
39
|
+
IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
|
40
|
+
|
41
|
+
** YOUR TURN **
|
42
|
+
Natural language query:
|
43
|
+
{{input}}
|
44
|
+
|
45
|
+
LLM generated SQL query:
|
46
|
+
{{actual_output}}
|
47
|
+
|
48
|
+
Table schema:
|
49
|
+
{{context}}
|
50
|
+
"""
|
51
|
+
}],
|
52
|
+
options={
|
53
|
+
"Y": 1.0,
|
54
|
+
"N": 0.0
|
55
|
+
}
|
56
|
+
)
|
57
|
+
|
58
|
+
|
59
|
+
if __name__ == "__main__":
|
60
|
+
client = JudgmentClient()
|
61
|
+
|
62
|
+
table_schema = """CREATE TABLE Artists (
|
63
|
+
artist_id VARCHAR(50) PRIMARY KEY,
|
64
|
+
name VARCHAR(255) NOT NULL,
|
65
|
+
genre VARCHAR(100),
|
66
|
+
followers INT,
|
67
|
+
popularity INT,
|
68
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
69
|
+
);
|
70
|
+
|
71
|
+
CREATE TABLE Albums (
|
72
|
+
album_id VARCHAR(50) PRIMARY KEY,
|
73
|
+
title VARCHAR(255) NOT NULL,
|
74
|
+
artist_id VARCHAR(50) NOT NULL,
|
75
|
+
release_date DATE,
|
76
|
+
total_tracks INT,
|
77
|
+
album_type VARCHAR(50) CHECK (album_type IN ('album', 'single', 'compilation')),
|
78
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
79
|
+
FOREIGN KEY (artist_id) REFERENCES Artists(artist_id) ON DELETE CASCADE
|
80
|
+
);
|
81
|
+
|
82
|
+
CREATE TABLE Tracks (
|
83
|
+
track_id VARCHAR(50) PRIMARY KEY,
|
84
|
+
title VARCHAR(255) NOT NULL,
|
85
|
+
album_id VARCHAR(50) NOT NULL,
|
86
|
+
artist_id VARCHAR(50) NOT NULL,
|
87
|
+
duration_ms INT NOT NULL,
|
88
|
+
explicit BOOLEAN DEFAULT FALSE,
|
89
|
+
popularity INT DEFAULT 0,
|
90
|
+
preview_url VARCHAR(255),
|
91
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
92
|
+
FOREIGN KEY (album_id) REFERENCES Albums(album_id) ON DELETE CASCADE,
|
93
|
+
FOREIGN KEY (artist_id) REFERENCES Artists(artist_id) ON DELETE CASCADE
|
94
|
+
);
|
95
|
+
|
96
|
+
CREATE TABLE Users (
|
97
|
+
user_id VARCHAR(50) PRIMARY KEY,
|
98
|
+
username VARCHAR(100) NOT NULL UNIQUE,
|
99
|
+
email VARCHAR(255) NOT NULL UNIQUE,
|
100
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
101
|
+
);
|
102
|
+
|
103
|
+
CREATE TABLE Playlists (
|
104
|
+
playlist_id VARCHAR(50) PRIMARY KEY,
|
105
|
+
user_id VARCHAR(50) NOT NULL,
|
106
|
+
name VARCHAR(255) NOT NULL,
|
107
|
+
description TEXT,
|
108
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
109
|
+
FOREIGN KEY (user_id) REFERENCES Users(user_id) ON DELETE CASCADE
|
110
|
+
);
|
111
|
+
|
112
|
+
CREATE TABLE PlaylistTracks (
|
113
|
+
playlist_id VARCHAR(50),
|
114
|
+
track_id VARCHAR(50),
|
115
|
+
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
116
|
+
PRIMARY KEY (playlist_id, track_id),
|
117
|
+
FOREIGN KEY (playlist_id) REFERENCES Playlists(playlist_id) ON DELETE CASCADE,
|
118
|
+
FOREIGN KEY (track_id) REFERENCES Tracks(track_id) ON DELETE CASCADE
|
119
|
+
);
|
120
|
+
|
121
|
+
CREATE TABLE UserListeningHistory (
|
122
|
+
history_id SERIAL PRIMARY KEY,
|
123
|
+
user_id VARCHAR(50) NOT NULL,
|
124
|
+
track_id VARCHAR(50) NOT NULL,
|
125
|
+
listened_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
126
|
+
FOREIGN KEY (user_id) REFERENCES Users(user_id) ON DELETE CASCADE,
|
127
|
+
FOREIGN KEY (track_id) REFERENCES Tracks(track_id) ON DELETE CASCADE
|
128
|
+
);
|
129
|
+
"""
|
130
|
+
|
131
|
+
all_tracks_one_artist_correct = Example(
|
132
|
+
input="Find all tracks by the artist 'Drake', sorted by popularity.",
|
133
|
+
actual_output="""SELECT t.track_id, t.title, t.popularity, a.name AS artist_name
|
134
|
+
FROM Tracks t
|
135
|
+
JOIN Artists a ON t.artist_id = a.artist_id
|
136
|
+
WHERE a.name = 'Drake'
|
137
|
+
ORDER BY t.popularity DESC;
|
138
|
+
""",
|
139
|
+
retrieval_context=[table_schema]
|
140
|
+
)
|
141
|
+
|
142
|
+
most_listened_to_one_user_correct = Example(
|
143
|
+
input="Find the most listened to track by user 'user123'.",
|
144
|
+
actual_output="""SELECT t.track_id, t.title, COUNT(uh.history_id) AS play_count
|
145
|
+
FROM UserListeningHistory uh
|
146
|
+
JOIN Tracks t ON uh.track_id = t.track_id
|
147
|
+
WHERE uh.user_id = 'user123'
|
148
|
+
GROUP BY t.track_id, t.title
|
149
|
+
ORDER BY play_count DESC
|
150
|
+
LIMIT 1;
|
151
|
+
""",
|
152
|
+
retrieval_context=[table_schema]
|
153
|
+
)
|
154
|
+
|
155
|
+
highest_num_playlists_correct = Example(
|
156
|
+
input="Find the 5 users with the highest number of playlists.",
|
157
|
+
actual_output="""SELECT u.user_id, u.username, COUNT(p.playlist_id) AS total_playlists
|
158
|
+
FROM Users u
|
159
|
+
JOIN Playlists p ON u.user_id = p.user_id
|
160
|
+
GROUP BY u.user_id, u.username
|
161
|
+
ORDER BY total_playlists DESC
|
162
|
+
LIMIT 5;
|
163
|
+
""",
|
164
|
+
retrieval_context=[table_schema]
|
165
|
+
)
|
166
|
+
|
167
|
+
most_popular_tracks_all_users_correct = Example(
|
168
|
+
input="Find the 10 most popular tracks across all users.",
|
169
|
+
actual_output="""SELECT t.track_id, t.title, COUNT(uh.history_id) AS total_listens
|
170
|
+
FROM Tracks t
|
171
|
+
JOIN UserListeningHistory uh ON t.track_id = uh.track_id
|
172
|
+
GROUP BY t.track_id, t.title
|
173
|
+
ORDER BY total_listens DESC
|
174
|
+
LIMIT 10;
|
175
|
+
""",
|
176
|
+
retrieval_context=[table_schema]
|
177
|
+
)
|
178
|
+
|
179
|
+
most_popular_tracks_all_users_incorrect = Example(
|
180
|
+
input="Find the 10 most popular tracks across all users.",
|
181
|
+
actual_output="""SELECT t.track_user, t.title, COUNT(uh.history_id) AS total_listens
|
182
|
+
FROM Tracks t
|
183
|
+
JOIN UserHistory uh ON t.track_user = uh.track_user
|
184
|
+
GROUP BY t.track_user, t.title
|
185
|
+
ORDER BY total_listens DESC
|
186
|
+
LIMIT 10;
|
187
|
+
""",
|
188
|
+
retrieval_context=[table_schema]
|
189
|
+
)
|
190
|
+
|
191
|
+
|
192
|
+
client.run_evaluation(
|
193
|
+
examples=[
|
194
|
+
all_tracks_one_artist_correct,
|
195
|
+
most_listened_to_one_user_correct,
|
196
|
+
highest_num_playlists_correct,
|
197
|
+
most_popular_tracks_all_users_correct,
|
198
|
+
most_popular_tracks_all_users_incorrect
|
199
|
+
],
|
200
|
+
scorers=[text2sql_scorer],
|
201
|
+
model="gpt-4o-mini",
|
202
|
+
project_name="text2sql",
|
203
|
+
eval_run_name="text2sql_test",
|
204
|
+
override=True
|
205
|
+
)
|
@@ -13,6 +13,7 @@ from judgeval.scorers.judgeval_scorers import (
|
|
13
13
|
AnswerRelevancyScorer,
|
14
14
|
ScorerWrapper,
|
15
15
|
AnswerCorrectnessScorer,
|
16
|
+
Text2SQLScorer,
|
16
17
|
)
|
17
18
|
|
18
19
|
__all__ = [
|
@@ -31,4 +32,5 @@ __all__ = [
|
|
31
32
|
"AnswerRelevancyScorer",
|
32
33
|
"ScorerWrapper",
|
33
34
|
"AnswerCorrectnessScorer",
|
35
|
+
"Text2SQLScorer",
|
34
36
|
]
|
@@ -28,6 +28,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
|
|
28
28
|
AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
|
29
29
|
)
|
30
30
|
|
31
|
+
from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
|
32
|
+
|
33
|
+
|
31
34
|
class ScorerWrapper:
|
32
35
|
"""
|
33
36
|
Wrapper class that can dynamically load either API or local implementation of a scorer.
|
@@ -141,4 +144,5 @@ __all__ = [
|
|
141
144
|
"ContextualPrecisionScorer",
|
142
145
|
"ContextualRecallScorer",
|
143
146
|
"AnswerRelevancyScorer",
|
147
|
+
"Text2SQLScorer",
|
144
148
|
]
|
@@ -0,0 +1,54 @@
|
|
1
|
+
"""
|
2
|
+
ClassifierScorer implementation for basic Text-to-SQL evaluation.
|
3
|
+
|
4
|
+
Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
|
5
|
+
Determines if the LLM-generated SQL query is valid and works for the natural language query.
|
6
|
+
"""
|
7
|
+
from judgeval.scorers import ClassifierScorer
|
8
|
+
|
9
|
+
Text2SQLScorer = ClassifierScorer(
|
10
|
+
"Text to SQL",
|
11
|
+
slug="text2sql-1010101010",
|
12
|
+
threshold=1.0,
|
13
|
+
conversation=[{
|
14
|
+
"role": "system",
|
15
|
+
"content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
|
16
|
+
|
17
|
+
** TASK INSTRUCTIONS **
|
18
|
+
Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
|
19
|
+
Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
|
20
|
+
|
21
|
+
** TIPS **
|
22
|
+
- Look for correct references to the table schema for column names, table names, etc.
|
23
|
+
- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
|
24
|
+
- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
|
25
|
+
- Verify that WHERE conditions use the correct operators and data types for comparisons
|
26
|
+
- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
|
27
|
+
- Check that JOINs use the correct keys and maintain referential integrity
|
28
|
+
- Verify that ORDER BY clauses use valid column names and sort directions
|
29
|
+
- Check for proper handling of NULL values where relevant
|
30
|
+
- Ensure subqueries are properly constructed and correlated when needed
|
31
|
+
- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
|
32
|
+
|
33
|
+
** FORMATTING YOUR ANSWER **
|
34
|
+
If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
|
35
|
+
IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
|
36
|
+
IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
|
37
|
+
|
38
|
+
** YOUR TURN **
|
39
|
+
Natural language query:
|
40
|
+
{{input}}
|
41
|
+
|
42
|
+
LLM generated SQL query:
|
43
|
+
{{actual_output}}
|
44
|
+
|
45
|
+
Table schema:
|
46
|
+
{{context}}
|
47
|
+
"""
|
48
|
+
}],
|
49
|
+
options={
|
50
|
+
"Y": 1.0,
|
51
|
+
"N": 0.0
|
52
|
+
}
|
53
|
+
)
|
54
|
+
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb
RENAMED
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb
RENAMED
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
RENAMED
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
RENAMED
File without changes
|
File without changes
|
{judgeval-0.0.8 → judgeval-0.0.9}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|