judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/together_judge.py CHANGED
@@ -3,14 +3,77 @@ Implementation of using TogetherAI inference for judges.
  """

  from pydantic import BaseModel
- from typing import List, Union
-
+ from typing import Dict, List, Union, Any, cast
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     fetch_together_api_response,
-     afetch_together_api_response,
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import (
+     JUDGMENT_DEFAULT_TOGETHER_MODEL,
+     TOGETHERAI_API_KEY,
+     TOGETHER_API_KEY,
  )
- from judgeval.common.logger import judgeval_logger
+
+ together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+ if together_api_key:
+     try:
+         from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+         together_client = Together(api_key=together_api_key)
+         async_together_client = AsyncTogether(api_key=together_api_key)
+     except Exception:
+         pass
+
+
+ def fetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+
+
+ async def afetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -18,44 +81,52 @@ BASE_CONVERSATION = [


  class TogetherJudge(JudgevalJudge):
-     def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
+     def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)

-     # TODO: Fix cost for generate and a_generate
-     def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
+     def generate(
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
+     ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
          else:
              judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      async def a_generate(
-         self, input: Union[str, List[dict]], schema: BaseModel = None
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
              return res
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
              return res
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      def load_model(self) -> str:
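For orientation, here is a minimal usage sketch of the reworked TogetherJudge. It assumes TOGETHERAI_API_KEY or TOGETHER_API_KEY is set so the module-level Together clients get created; AnswerSchema is a hypothetical model used only for illustration.

from pydantic import BaseModel

from judgeval.judges.together_judge import TogetherJudge


class AnswerSchema(BaseModel):
    # Hypothetical schema; generate() converts it via model_json_schema()
    # into the response_format passed to TogetherAI.
    verdict: str
    reason: str


judge = TogetherJudge()  # defaults to JUDGMENT_DEFAULT_TOGETHER_MODEL
# A plain string is wrapped into BASE_CONVERSATION as a user turn.
text = judge.generate("Is the sky blue? Answer briefly.")
# Passing a pydantic model requests schema-constrained output.
structured = judge.generate("Is the sky blue?", schema=AnswerSchema)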
judgeval/judges/utils.py CHANGED
@@ -3,21 +3,21 @@ This module contains utility functions for judge models.
  """

  import litellm
- from typing import Optional, Union, Tuple, List
+ from typing import Optional, Union, Tuple

- from judgeval.common.exceptions import InvalidJudgeModelError
- from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.exceptions import InvalidJudgeModelError
+ from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
  from judgeval.constants import (
      TOGETHER_SUPPORTED_MODELS,
      JUDGMENT_SUPPORTED_MODELS,
-     ACCEPTABLE_MODELS,
  )

  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


  def create_judge(
-     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+     model: Optional[Union[str, JudgevalJudge]] = None,
  ) -> Tuple[JudgevalJudge, bool]:
      """
      Creates a judge model from string(s) or a judgeval judge object.
@@ -30,28 +30,15 @@ def create_judge(
      If no model is provided, uses GPT4o as the default judge.
      """
      if model is None:  # default option
-         return LiteLLMJudge(model="gpt-4.1"), True
+         return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
      if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(
              f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
          )
      # If model is already a valid judge type, return it and mark native
-     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
          return model, True

-     # Either string or List[str]
-     if isinstance(model, list):
-         for m in model:
-             if m in JUDGMENT_SUPPORTED_MODELS:
-                 raise NotImplementedError(
-                     """Judgment models are not yet supported for local scoring.
-                     Please either set the `use_judgment` flag to True or use
-                     non-Judgment models."""
-                 )
-             if m not in ACCEPTABLE_MODELS:
-                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-         return MixtureOfJudges(models=model), True
-     # If model is a string, check that it corresponds to a valid model
      if model in LITELLM_SUPPORTED_MODELS:
          return LiteLLMJudge(model=model), True
      if model in TOGETHER_SUPPORTED_MODELS:
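As a quick illustration of the narrowed create_judge signature, a short sketch (the model name is an example only and is routed by membership in the supported-model sets):

from judgeval.judges.utils import create_judge

# No model: falls back to LiteLLMJudge with JUDGMENT_DEFAULT_GPT_MODEL.
judge, is_native = create_judge()

# A string is matched against LITELLM_SUPPORTED_MODELS / TOGETHER_SUPPORTED_MODELS;
# lists of models (the old MixtureOfJudges path) are no longer accepted.
litellm_judge, _ = create_judge("gpt-4o-mini")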
judgeval/judgment_attribute_keys.py ADDED
@@ -0,0 +1,55 @@
+ from __future__ import annotations
+
+ from enum import Enum
+
+
+ class AttributeKeys(str, Enum):
+     JUDGMENT_SPAN_KIND = "judgment.span_kind"
+     JUDGMENT_INPUT = "judgment.input"
+     JUDGMENT_OUTPUT = "judgment.output"
+     JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
+     JUDGMENT_UPDATE_ID = "judgment.update_id"
+     JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
+     JUDGMENT_AGENT_ID = "judgment.agent_id"
+     JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
+     JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
+     JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
+     JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
+     JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
+     JUDGMENT_STATE_BEFORE = "judgment.state_before"
+     JUDGMENT_STATE_AFTER = "judgment.state_after"
+     JUDGMENT_PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
+     JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"
+
+     JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
+     JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
+     JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
+     JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+         "judgment.usage.cache_creation_input_tokens"
+     )
+     JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
+     JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
+     JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
+
+     GEN_AI_PROMPT = "gen_ai.prompt"
+     GEN_AI_COMPLETION = "gen_ai.completion"
+     GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
+     GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+     GEN_AI_SYSTEM = "gen_ai.system"
+     GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
+     GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
+     GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+         "gen_ai.usage.cache_creation_input_tokens"
+     )
+     GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
+     GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+     GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+     GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
+
+
+ class ResourceKeys(str, Enum):
+     SERVICE_NAME = "service.name"
+     TELEMETRY_SDK_LANGUAGE = "telemetry.sdk.language"
+     TELEMETRY_SDK_NAME = "telemetry.sdk.name"
+     TELEMETRY_SDK_VERSION = "telemetry.sdk.version"
+     JUDGMENT_PROJECT_ID = "judgment.project_id"
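The new enums are plain string-valued constants; a small sketch of how they might be consumed as span attribute names (attaching them to an attribute dict is illustrative, not a documented judgeval API):

from judgeval.judgment_attribute_keys import AttributeKeys, ResourceKeys

# str-Enum members compare equal to their raw attribute names.
assert AttributeKeys.JUDGMENT_SPAN_KIND == "judgment.span_kind"
assert ResourceKeys.SERVICE_NAME.value == "service.name"

# e.g. building an attribute mapping for a tracing span:
attributes = {AttributeKeys.JUDGMENT_CUSTOMER_ID.value: "customer-123"}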
judgeval/{common/logger.py → logger.py} RENAMED
@@ -1,10 +1,9 @@
- # logger.py
-
  import logging
  import sys
- import os

- # ANSI escape sequences
+ from judgeval.env import JUDGMENT_NO_COLOR, JUDGMENT_LOG_LEVEL
+ from judgeval.utils.decorators.use_once import use_once
+
  RESET = "\033[0m"
  RED = "\033[31m"
  YELLOW = "\033[33m"
@@ -38,10 +37,25 @@ class ColorFormatter(logging.Formatter):
          return message


+ def _parse_log_level(level_str: str) -> int:
+     level_map = {
+         "debug": logging.DEBUG,
+         "info": logging.INFO,
+         "warning": logging.WARNING,
+         "warn": logging.WARNING,
+         "error": logging.ERROR,
+         "critical": logging.CRITICAL,
+     }
+     return level_map.get(level_str.lower(), logging.WARNING)
+
+
+ @use_once
  def _setup_judgeval_logger():
-     use_color = sys.stdout.isatty() and os.getenv("NO_COLOR") is None
+     use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
+     log_level = _parse_log_level(JUDGMENT_LOG_LEVEL)
+
      handler = logging.StreamHandler(sys.stdout)
-     handler.setLevel(logging.DEBUG)
+     handler.setLevel(log_level)
      handler.setFormatter(
          ColorFormatter(
              fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -51,10 +65,12 @@ def _setup_judgeval_logger():
      )

      logger = logging.getLogger("judgeval")
-     logger.setLevel(logging.DEBUG)
+     logger.setLevel(log_level)
      logger.addHandler(handler)
      return logger


- # Global logger you can import elsewhere
  judgeval_logger = _setup_judgeval_logger()
+
+
+ __all__ = ("judgeval_logger",)
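A sketch of how the environment-driven logger configuration could be exercised. The environment variable names are assumed to mirror the constants in judgeval.env; that mapping is not shown in this diff.

import os

# Assumed env-var names corresponding to JUDGMENT_LOG_LEVEL / JUDGMENT_NO_COLOR.
os.environ["JUDGMENT_LOG_LEVEL"] = "debug"  # parsed by _parse_log_level
os.environ["JUDGMENT_NO_COLOR"] = "1"       # disables ANSI colors

# Import after setting the environment: @use_once means the logger is
# configured exactly once, on first import.
from judgeval.logger import judgeval_logger

judgeval_logger.debug("emitted if the level resolved to DEBUG")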
judgeval/prompt/__init__.py ADDED
@@ -0,0 +1,330 @@
+ from typing import List, Optional, Dict
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.exceptions import JudgmentAPIError
+ from judgeval.api.api_types import (
+     PromptCommitInfo,
+     PromptTagResponse,
+     PromptUntagResponse,
+     PromptVersionsResponse,
+ )
+ from dataclasses import dataclass, field
+ import re
+ from string import Template
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+ from judgeval.utils.project import _resolve_project_id
+
+
+ def push_prompt(
+     project_name: str,
+     name: str,
+     prompt: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> tuple[str, Optional[str], str]:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         r = client.prompts_insert(
+             payload={
+                 "project_id": project_id,
+                 "name": name,
+                 "prompt": prompt,
+                 "tags": tags,
+             }
+         )
+         return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to save prompt: {e.detail}",
+             response=e.response,
+         )
+
+
+ def fetch_prompt(
+     project_name: str,
+     name: str,
+     commit_id: Optional[str] = None,
+     tag: Optional[str] = None,
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> Optional[PromptCommitInfo]:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_fetch(
+             name=name,
+             project_id=project_id,
+             commit_id=commit_id,
+             tag=tag,
+         )
+         return prompt_config["commit"]
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to fetch prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def tag_prompt(
+     project_name: str,
+     name: str,
+     commit_id: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptTagResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_tag(
+             payload={
+                 "project_id": project_id,
+                 "name": name,
+                 "commit_id": commit_id,
+                 "tags": tags,
+             }
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to tag prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def untag_prompt(
+     project_name: str,
+     name: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptUntagResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_untag(
+             payload={"project_id": project_id, "name": name, "tags": tags}
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to untag prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def list_prompt(
+     project_name: str,
+     name: str,
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptVersionsResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_get_prompt_versions(
+             project_id=project_id, name=name
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to list prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ @dataclass
+ class Prompt:
+     name: str
+     prompt: str
+     created_at: str
+     tags: List[str]
+     commit_id: str
+     parent_commit_id: Optional[str] = None
+     metadata: Dict[str, str] = field(default_factory=dict)
+     _template: Template = field(init=False, repr=False)
+
+     def __post_init__(self):
+         template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+         self._template = Template(template_str)
+
+     @classmethod
+     def create(
+         cls,
+         project_name: str,
+         name: str,
+         prompt: str,
+         tags: Optional[List[str]] = None,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         if tags is None:
+             tags = []
+         commit_id, parent_commit_id, created_at = push_prompt(
+             project_name, name, prompt, tags, judgment_api_key, organization_id
+         )
+         return cls(
+             name=name,
+             prompt=prompt,
+             created_at=created_at,
+             tags=tags,
+             commit_id=commit_id,
+             parent_commit_id=parent_commit_id,
+         )
+
+     @classmethod
+     def get(
+         cls,
+         project_name: str,
+         name: str,
+         commit_id: Optional[str] = None,
+         tag: Optional[str] = None,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         if commit_id is not None and tag is not None:
+             raise ValueError(
+                 "You cannot fetch a prompt by both commit_id and tag at the same time"
+             )
+         prompt_config = fetch_prompt(
+             project_name, name, commit_id, tag, judgment_api_key, organization_id
+         )
+         if prompt_config is None:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Prompt '{name}' not found in project '{project_name}'",
+                 response=None,  # type: ignore
+             )
+         return cls(
+             name=prompt_config["name"],
+             prompt=prompt_config["prompt"],
+             created_at=prompt_config["created_at"],
+             tags=prompt_config["tags"],
+             commit_id=prompt_config["commit_id"],
+             parent_commit_id=prompt_config.get("parent_commit_id"),
+             metadata={
+                 "creator_first_name": prompt_config["first_name"],
+                 "creator_last_name": prompt_config["last_name"],
+                 "creator_email": prompt_config["user_email"],
+             },
+         )
+
+     @classmethod
+     def tag(
+         cls,
+         project_name: str,
+         name: str,
+         commit_id: str,
+         tags: List[str],
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_config = tag_prompt(
+             project_name, name, commit_id, tags, judgment_api_key, organization_id
+         )
+         return prompt_config["commit_id"]
+
+     @classmethod
+     def untag(
+         cls,
+         project_name: str,
+         name: str,
+         tags: List[str],
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_config = untag_prompt(
+             project_name, name, tags, judgment_api_key, organization_id
+         )
+         return prompt_config["commit_ids"]
+
+     @classmethod
+     def list(
+         cls,
+         project_name: str,
+         name: str,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_configs = list_prompt(
+             project_name, name, judgment_api_key, organization_id
+         )["versions"]
+         return [
+             cls(
+                 name=prompt_config["name"],
+                 prompt=prompt_config["prompt"],
+                 tags=prompt_config["tags"],
+                 created_at=prompt_config["created_at"],
+                 commit_id=prompt_config["commit_id"],
+                 parent_commit_id=prompt_config.get("parent_commit_id"),
+                 metadata={
+                     "creator_first_name": prompt_config["first_name"],
+                     "creator_last_name": prompt_config["last_name"],
+                     "creator_email": prompt_config["user_email"],
+                 },
+             )
+             for prompt_config in prompt_configs
+         ]
+
+     def compile(self, **kwargs) -> str:
+         try:
+             return self._template.substitute(**kwargs)
+         except KeyError as e:
+             missing_var = str(e).strip("'")
+             raise ValueError(f"Missing required variable: {missing_var}")
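To make the new prompt module concrete, a minimal sketch of the round trip; the project and prompt names are placeholders, and it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are configured.

from judgeval.prompt import Prompt

# Push a new version; {{variable}} placeholders become string.Template fields.
prompt = Prompt.create(
    project_name="my-project",
    name="greeting",
    prompt="Hello {{name}}, welcome to {{product}}!",
    tags=["v1"],
)

# Fetch by tag (or commit_id, but not both) and render it.
fetched = Prompt.get(project_name="my-project", name="greeting", tag="v1")
text = fetched.compile(name="Ada", product="Judgment")
# A missing variable raises ValueError("Missing required variable: ...").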
judgeval/scorers/__init__.py CHANGED
@@ -1,29 +1,29 @@
- from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.api_scorer import (
+     APIScorerConfig,
+     ExampleAPIScorerConfig,
+     TraceAPIScorerConfig,
+ )
  from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
-     ExecutionOrderScorer,
-     HallucinationScorer,
      FaithfulnessScorer,
      AnswerRelevancyScorer,
      AnswerCorrectnessScorer,
      InstructionAdherenceScorer,
-     DerailmentScorer,
-     ToolOrderScorer,
+     TracePromptScorer,
      PromptScorer,
-     ToolDependencyScorer,
  )

  __all__ = [
      "APIScorerConfig",
+     "ExampleAPIScorerConfig",
+     "TraceAPIScorerConfig",
      "BaseScorer",
+     "ExampleScorer",
+     "TracePromptScorer",
      "PromptScorer",
-     "ExecutionOrderScorer",
-     "HallucinationScorer",
      "FaithfulnessScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
      "InstructionAdherenceScorer",
-     "DerailmentScorer",
-     "ToolOrderScorer",
-     "ToolDependencyScorer",
  ]
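Finally, a short sketch of the resulting import surface (illustrative only; it simply reflects the exports shown above):

# Still importable from judgeval.scorers in 0.23.0:
from judgeval.scorers import (
    AnswerRelevancyScorer,
    ExampleScorer,       # new export
    TracePromptScorer,   # new export
)

# Removed from the package exports, so this now fails:
# from judgeval.scorers import HallucinationScorer  # ImportError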