judgeval 0.0.54__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +209 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.1.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.54.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,73 +0,0 @@
1
from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.constants import APIScorerType
from typing import List, Mapping, Optional, Dict, Any


class ClassifierScorer(APIScorerConfig):
    """Configuration for a classification-style prompt scorer.

    On the Judgment backend this is executed as a PromptScorer: a judge model
    is given a conversation template — whose placeholders (e.g.
    ``{{actual_output}}``) are filled in from the Example — and classifies the
    result into one of the configured options.

    Example:
        conversation = [{"role": "system",
                         "content": "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"}]
        options = {"positive": 1, "negative": 0}

    Fields:
        slug: Unique identifier for the scorer.
        conversation: Conversation template with placeholders (e.g. ``{{actual_output}}``).
        options: Maps each classification label to its numeric score.
    """

    slug: Optional[str] = None
    conversation: Optional[List[dict]] = None
    options: Optional[Mapping[str, float]] = None
    score_type: APIScorerType = APIScorerType.PROMPT_SCORER

    def update_name(self, name: str):
        """Replace the scorer's name."""
        self.name = name

    def update_threshold(self, threshold: float):
        """Replace the scorer's passing threshold."""
        self.threshold = threshold

    def update_conversation(self, conversation: List[dict]):
        """Replace the conversation template.

        Sample conversation:
            [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
        """
        self.conversation = conversation

    def update_options(self, options: Mapping[str, float]):
        """Replace the label-to-score mapping.

        Sample options:
            {"yes": 1, "no": 0}
        """
        self.options = options

    def __str__(self):
        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"

    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
        """Serialize, folding subclass-only fields into a ``"kwargs"`` dict.

        Fields declared on this subclass but not on ``APIScorerConfig`` are
        collected — skipping ``None`` values — under the ``"kwargs"`` key of
        the dumped payload.
        """
        dumped = super().model_dump(*args, **kwargs)
        # Fields introduced by the subclass, excluding the payload key itself.
        subclass_only = (
            set(self.__class__.model_fields.keys())
            - set(APIScorerConfig.model_fields.keys())
            - {"kwargs"}
        )
        extras: Dict[str, Any] = {}
        for field_name in subclass_only:
            value = getattr(self, field_name)
            if value is not None:
                extras[field_name] = value
        dumped["kwargs"] = extras
        return dumped
@@ -1,3 +0,0 @@
1
"""Re-export the prebuilt classifier scorers for the ``classifiers`` subpackage."""

from .text2sql import Text2SQLScorer

__all__ = ["Text2SQLScorer"]
@@ -1,3 +0,0 @@
1
"""Re-export ``Text2SQLScorer`` as the public name of the ``text2sql`` subpackage."""

from .text2sql_scorer import Text2SQLScorer

__all__ = ["Text2SQLScorer"]
@@ -1,53 +0,0 @@
1
"""
ClassifierScorer implementation for basic Text-to-SQL evaluation.

Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
Determines if the LLM-generated SQL query is valid and works for the natural language query.
"""

from judgeval.scorers import ClassifierScorer

# Prebuilt, module-level ClassifierScorer instance. A single system prompt asks
# a judge model to classify the generated SQL as "Y" (valid AND answers the
# natural language query) or "N" (invalid or off-target). The {{input}},
# {{actual_output}}, and {{context}} placeholders are filled from the Example
# at evaluation time.
# NOTE(review): with threshold=1.0 and options {"Y": 1.0, "N": 0.0}, only a
# "Y" classification appears able to pass — confirm against scorer semantics.
Text2SQLScorer = ClassifierScorer(
    name="Text to SQL",
    slug="text2sql-1010101010",
    threshold=1.0,
    conversation=[
        {
            "role": "system",
            "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.

** TASK INSTRUCTIONS **
Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)

** TIPS **
- Look for correct references to the table schema for column names, table names, etc.
- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
- Verify that WHERE conditions use the correct operators and data types for comparisons
- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
- Check that JOINs use the correct keys and maintain referential integrity
- Verify that ORDER BY clauses use valid column names and sort directions
- Check for proper handling of NULL values where relevant
- Ensure subqueries are properly constructed and correlated when needed
- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.

** FORMATTING YOUR ANSWER **
If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.

** YOUR TURN **
Natural language query:
{{input}}

LLM generated SQL query:
{{actual_output}}

Table schema:
{{context}}
""",
        }
    ],
    # Binary scoring: "Y" maps to 1.0 (pass), "N" maps to 0.0 (fail).
    options={"Y": 1.0, "N": 0.0},
)