judgeval 0.16.4.tar.gz → 0.16.6.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {judgeval-0.16.4 → judgeval-0.16.6}/.pre-commit-config.yaml +2 -2
- {judgeval-0.16.4 → judgeval-0.16.6}/PKG-INFO +2 -2
- {judgeval-0.16.4 → judgeval-0.16.6}/pyproject.toml +2 -2
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/__init__.py +7 -2
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +15 -4
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/__init__.py +9 -1
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_anthropic/wrapper.py +160 -130
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_google/wrapper.py +137 -98
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_groq/wrapper.py +137 -116
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_openai/wrapper.py +130 -106
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_together/wrapper.py +145 -120
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/utils.py +1 -1
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/decorators/dont_throw.py +1 -1
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/version.py +1 -1
- {judgeval-0.16.4 → judgeval-0.16.6}/uv.lock +59 -3
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/pull_request_template.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/claude.yml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/release.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/.gitignore +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/CONTRIBUTING.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/LICENSE.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/README.md +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/agent.gif +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/agent_trace_example.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/company.jpg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/company_banner.jpg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/darkmode.svg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/full_logo.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/icon.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/lightmode.svg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/brand/white_background.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/custom_scorer_online_abm.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/data.gif +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/document.gif +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/errors.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/experiments_page.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/logo_darkmode.svg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/logo_lightmode.svg +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/online_eval.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/product_shot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/quickstart_trajectory_ss.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/test.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/tests.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/trace.gif +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/trace_demo.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/trace_screenshot.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/pytest.ini +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/scripts/api_generator.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/scripts/openapi_transform.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/scripts/update_types.sh +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/api/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/api/api_types.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/cli.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/constants.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/example.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/judgment_types.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/result.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/data/trace.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/dataset/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/env.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/evaluation/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/integrations/openlit/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/logger.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/constants.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_anthropic/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_anthropic/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_google/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_google/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_groq/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_groq/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_openai/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_openai/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_together/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_together/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/local_eval_queue.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/processors/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/trainer/trainer.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/decorators/__init__.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/decorators/use_once.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/warnings.py +0 -0
- {judgeval-0.16.4 → judgeval-0.16.6}/update_version.py +0 -0
{judgeval-0.16.4 → judgeval-0.16.6}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.16.4
+Version: 0.16.6
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,7 +14,7 @@ Requires-Dist: boto3>=1.40.11
 Requires-Dist: click<8.2.0
 Requires-Dist: dotenv
 Requires-Dist: httpx>=0.28.1
-Requires-Dist: litellm
+Requires-Dist: litellm>=1.75.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
 Requires-Dist: opentelemetry-sdk>=1.36.0
 Requires-Dist: orjson>=3.9.0
```
{judgeval-0.16.4 → judgeval-0.16.6}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.16.4"
+version = "0.16.6"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -19,7 +19,7 @@ license-files = ["LICENSE.md"]
 dependencies = [
     "dotenv",
     "httpx>=0.28.1",
-    "litellm",
+    "litellm>=1.75.0",
     "opentelemetry-exporter-otlp>=1.36.0",
     "opentelemetry-sdk>=1.36.0",
     "orjson>=3.9.0",
```
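Both metadata files carry the same two substantive changes: the version bump and a new lower bound on litellm. A minimal way to check that an existing environment already satisfies the new floor (a sketch; it assumes the third-party `packaging` library is installed, which any environment with pip normally has):

```python
# Sketch: verify the installed litellm meets the new ">=1.75.0" floor.
# Assumes the third-party "packaging" library is available.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("litellm"))
assert installed >= Version("1.75.0"), (
    f"litellm {installed} does not satisfy judgeval 0.16.6's litellm>=1.75.0"
)
```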
{judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/__init__.py

```diff
@@ -39,18 +39,23 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer, None]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: Optional[str] = None,
         assert_test: bool = False,
     ) -> List[ScoringResult]:
         try:
+            for scorer in scorers:
+                if scorer is None:
+                    raise ValueError(
+                        "Failed to run evaluation: At least one Prompt Scorer was not successfuly retrieved."
+                    )
             eval = ExampleEvaluationRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
-                scorers=scorers,
+                scorers=scorers,  # type: ignore
                 model=model,
             )
```
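The new loop makes run_evaluation fail fast when any scorer lookup already failed upstream. The guard isolated as a runnable sketch (the function name here is illustrative; the diff above shows the exact in-package wording):

```python
from typing import Optional, Sequence

def validate_scorers(scorers: Sequence[Optional[object]]) -> None:
    # Mirrors the guard added to JudgmentClient.run_evaluation: a None entry
    # (e.g. a prompt-scorer fetch that failed and was swallowed upstream)
    # raises immediately instead of surfacing as a confusing error mid-run.
    for scorer in scorers:
        if scorer is None:
            raise ValueError(
                "Failed to run evaluation: at least one Prompt Scorer "
                "was not successfully retrieved."
            )

validate_scorers([object()])          # ok
# validate_scorers([object(), None])  # would raise ValueError
```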
{judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py

```diff
@@ -12,6 +12,7 @@ from judgeval.logger import judgeval_logger
 from abc import ABC
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 from copy import copy
+from judgeval.utils.decorators.dont_throw import dont_throw
 
 
 def push_prompt_scorer(
@@ -60,10 +61,19 @@ def fetch_prompt_scorer(
 ):
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        … (four deleted lines not preserved by the source diff viewer)
+        fetched_scorers = client.fetch_scorers({"names": [name]})
+        if len(fetched_scorers["scorers"]) == 0:
+            judgeval_logger.error(f"Prompt scorer '{name}' not found")
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt scorer '{name}' not found",
+                response=None,  # type: ignore
+            )
+        else:
+            scorer_config = fetched_scorers["scorers"][0]
+            scorer_config.pop("created_at")
+            scorer_config.pop("updated_at")
+            return scorer_config
     except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -109,6 +119,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
     @classmethod
+    @dont_throw
     def get(
         cls,
         name: str,
```
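The decisive change here is `@dont_throw` on `BasePromptScorer.get`: a failed fetch now returns None instead of raising, which is exactly why the call sites elsewhere in this release grow None guards. The decorator's body is not shown in this diff (src/judgeval/utils/decorators/dont_throw.py changes by only one line), so the following is a sketch of the conventional shape of such a decorator, not the package's exact code:

```python
import functools
import logging
from typing import Any, Callable, Optional, TypeVar

T = TypeVar("T")
logger = logging.getLogger(__name__)

def dont_throw(func: Callable[..., T]) -> Callable[..., Optional[T]]:
    """Sketch: log any exception from func and return None instead of raising."""
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Optional[T]:
        try:
            return func(*args, **kwargs)
        except Exception:
            logger.exception("%s raised; returning None", func.__name__)
            return None
    return wrapper
```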
{judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/__init__.py

```diff
@@ -267,6 +267,7 @@ class Tracer(metaclass=SingletonMeta):
         if span and span.is_recording():
             set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
 
+    @dont_throw
     def add_agent_attributes_to_span(self, span):
         """Add agent ID, class name, and instance name to span if they exist in context"""
         current_agent_context = self.agent_context.get()
@@ -342,6 +343,9 @@ class Tracer(metaclass=SingletonMeta):
         run_condition = scorer_config.run_condition
         sampling_rate = scorer_config.sampling_rate
 
+        if scorer is None:
+            judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
+            return
         if not isinstance(scorer, (TraceAPIScorerConfig)):
             judgeval_logger.error(
                 "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
@@ -877,7 +881,7 @@ class Tracer(metaclass=SingletonMeta):
         self,
         /,
         *,
-        scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
+        scorer: Union[ExampleAPIScorerConfig, ExampleScorer, None],
         example: Example,
         model: Optional[str] = None,
         sampling_rate: float = 1.0,
@@ -886,6 +890,10 @@ class Tracer(metaclass=SingletonMeta):
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
 
+        if scorer is None:
+            judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
+            return
+
         if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
             judgeval_logger.error(
                 "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
```
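Both new guards follow the same rule that motivates `@dont_throw` on add_agent_attributes_to_span: telemetry and evaluation plumbing must never crash the host application. A generic sketch of the pattern (illustrative names; `set_attribute` is the real OpenTelemetry span API, the rest is an assumption):

```python
import logging
from typing import Any, Mapping

logger = logging.getLogger("judgeval.sketch")

def set_span_attributes_safely(span: Any, attributes: Mapping[str, Any]) -> None:
    # Best-effort attribute recording: any telemetry failure is logged and
    # swallowed so the instrumented application keeps running.
    try:
        for key, value in attributes.items():
            if span is not None and value is not None:
                span.set_attribute(key, value)
    except Exception as e:
        logger.error("Error adding span metadata: %s", e)
```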
{judgeval-0.16.4 → judgeval-0.16.6}/src/judgeval/tracer/llm/llm_anthropic/wrapper.py

```diff
@@ -1,11 +1,11 @@
 from __future__ import annotations
 import functools
-import orjson
 from typing import (
     TYPE_CHECKING,
     Callable,
     Optional,
     Protocol,
+    TypeVar,
     Tuple,
     Union,
     Iterator,
@@ -19,6 +19,7 @@ from judgeval.tracer.llm.llm_anthropic.config import (
     anthropic_AsyncAnthropic,
 )
 from judgeval.tracer.managers import sync_span_context, async_span_context
+from judgeval.logger import judgeval_logger
 from judgeval.tracer.keys import AttributeKeys
 from judgeval.tracer.utils import set_span_attribute
 from judgeval.utils.serialize import safe_serialize
@@ -28,10 +29,6 @@ if TYPE_CHECKING:
     from opentelemetry.trace import Span
 
 
-# Keep the original client type for runtime compatibility
-AnthropicClientType = Union[anthropic_Anthropic, anthropic_AsyncAnthropic]
-
-
 # Content block protocols
 @runtime_checkable
 class AnthropicContentBlock(Protocol):
@@ -81,6 +78,10 @@ class AnthropicAsyncClient(Protocol):
         pass
 
 
+# Generic client type bound to both sync and async client protocols
+TClient = TypeVar("TClient", bound=Union[AnthropicClient, AnthropicAsyncClient])
+
+
 # Union types
 AnthropicResponseType = AnthropicMessage
 AnthropicStreamType = Union[
@@ -193,7 +194,7 @@ class TracedAnthropicGenerator:
         self,
         tracer: Tracer,
         generator: Iterator[AnthropicStreamEvent],
-        client: AnthropicClientType,
+        client: AnthropicClient,
         span: Span,
         model_name: str,
     ):
@@ -261,7 +262,7 @@ class TracedAnthropicAsyncGenerator:
         self,
         tracer: Tracer,
         async_generator: AsyncIterator[AnthropicStreamEvent],
-        client: AnthropicClientType,
+        client: AnthropicAsyncClient,
         span: Span,
         model_name: str,
     ):
@@ -278,6 +279,19 @@ class TracedAnthropicAsyncGenerator:
     async def __anext__(self) -> AnthropicStreamEvent:
         try:
             chunk = await self.async_generator.__anext__()
+        except StopAsyncIteration:
+            set_span_attribute(
+                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
+            )
+            self.span.end()
+            raise
+        except Exception as e:
+            if self.span:
+                self.span.record_exception(e)
+                self.span.end()
+            raise
+
+        try:
             content = _extract_anthropic_content(chunk)
             if content:
                 self.accumulated_content += content
@@ -310,18 +324,14 @@ class TracedAnthropicAsyncGenerator:
                     AttributeKeys.JUDGMENT_USAGE_METADATA,
                     safe_serialize(usage_data),
                 )
-            return chunk
-        except StopAsyncIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
         except Exception as e:
             if self.span:
-                self.span.record_exception(e)
                 self.span.end()
-            raise
+            judgeval_logger.error(
+                f"[anthropic wrapped_async] Error adding span metadata: {e}"
+            )
+        finally:
+            return chunk
 
 
 class TracedAnthropicSyncContextManager:
```
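The `__anext__` rewrite splits one try block into two so the three failure modes stop sharing an except chain: normal exhaustion ends the span with the accumulated completion, a genuine stream error is recorded and re-raised, and a bug in the span bookkeeping is merely logged while the chunk is still delivered. A condensed sketch of the resulting control flow (a stand-in class, not judgeval's code; the span is duck-typed to the OpenTelemetry API):

```python
import logging

logger = logging.getLogger(__name__)

class TracedStreamSketch:
    def __init__(self, inner, span):
        self.inner = inner          # the provider's async iterator
        self.span = span            # an OpenTelemetry span
        self.accumulated = ""

    def __aiter__(self):
        return self

    async def __anext__(self):
        try:
            chunk = await self.inner.__anext__()
        except StopAsyncIteration:
            self.span.end()                 # stream done: close the span
            raise
        except Exception as e:
            self.span.record_exception(e)   # real stream failure
            self.span.end()
            raise
        try:
            self.accumulated += str(chunk)  # telemetry-only bookkeeping
        except Exception as e:
            logger.error("Error adding span metadata: %s", e)
        return chunk                        # the chunk is delivered regardless
```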
```diff
@@ -329,7 +339,7 @@ class TracedAnthropicSyncContextManager:
         self,
         tracer: Tracer,
         context_manager,
-        client: AnthropicClientType,
+        client: AnthropicClient,
         span: Span,
         model_name: str,
     ):
@@ -354,7 +364,7 @@ class TracedAnthropicAsyncContextManager:
         self,
         tracer: Tracer,
         context_manager,
-        client: AnthropicClientType,
+        client: AnthropicAsyncClient,
         span: Span,
         model_name: str,
     ):
@@ -374,9 +384,7 @@ class TracedAnthropicAsyncContextManager:
         return await self.context_manager.__aexit__(exc_type, exc_val, exc_tb)
 
 
-def wrap_anthropic_client(
-    tracer: Tracer, client: AnthropicClientType
-) -> AnthropicClientType:
+def wrap_anthropic_client(tracer: Tracer, client: TClient) -> TClient:
     def wrapped(function: Callable, span_name: str):
         @functools.wraps(function)
         def wrapper(*args, **kwargs):
```
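Replacing the `AnthropicClientType` union with a bound TypeVar means `wrap_anthropic_client` now returns the same concrete type it receives, so type checkers keep `Anthropic` and `AsyncAnthropic` distinct instead of collapsing them into a union. A self-contained illustration with stand-in classes:

```python
from typing import TypeVar, Union

class SyncClient: ...
class AsyncClient: ...

# Bound TypeVar: accepts either client, but the return type matches the input.
TClient = TypeVar("TClient", bound=Union[SyncClient, AsyncClient])

def wrap(client: TClient) -> TClient:
    # instrument in place, then hand the very same object back
    return client

wrapped_sync = wrap(SyncClient())    # inferred as SyncClient, not a Union
wrapped_async = wrap(AsyncClient())  # inferred as AsyncClient
```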
The two large hunks below wrap the sync and async call paths in try/except so span bookkeeping can never break the API call itself, and swap `orjson.dumps` for `safe_serialize`. Deleted lines that the source viewer blanked are reconstructed from the surrounding fragments (the old body is the same attribute-setting code, unwrapped and using orjson).

```diff
@@ -398,68 +406,77 @@ def wrap_anthropic_client(
             with sync_span_context(
                 tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
             ) as span:
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
-                )
-
-                response = function(*args, **kwargs)
-
-                if isinstance(response, AnthropicMessage):
-                    output, usage_data = _format_anthropic_output(response)
-                    # Serialize structured data to JSON for span attribute
-                    if isinstance(output, list):
-                        output_str = orjson.dumps(
-                            output, option=orjson.OPT_INDENT_2
-                        ).decode()
-                    else:
-                        output_str = str(output) if output is not None else None
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                    )
-
-                    if usage_data:
-                        (
-                            prompt_tokens,
-                            completion_tokens,
-                            cache_read,
-                            cache_creation,
-                        ) = _extract_anthropic_tokens(usage_data)
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                            prompt_tokens,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                            completion_tokens,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                            cache_read,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
-                            cache_creation,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.JUDGMENT_USAGE_METADATA,
-                            safe_serialize(usage_data),
-                        )
-                    set_span_attribute(
-                        span,
-                        AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                        getattr(response, "model", model_name),
-                    )
-                return response
+                try:
+                    tracer.add_agent_attributes_to_span(span)
+                    set_span_attribute(
+                        span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+                    )
+                    model_name = kwargs.get("model", "")
+                    set_span_attribute(
+                        span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
+                    )
+                except Exception as e:
+                    judgeval_logger.error(
+                        f"[anthropic wrapped] Error adding span metadata: {e}"
+                    )
+
+                response = function(*args, **kwargs)
+
+                try:
+                    if isinstance(response, AnthropicMessage):
+                        output, usage_data = _format_anthropic_output(response)
+                        # Serialize structured data to JSON for span attribute
+                        if isinstance(output, list):
+                            output_str = safe_serialize(output)
+                        else:
+                            output_str = str(output) if output is not None else None
+                        set_span_attribute(
+                            span, AttributeKeys.GEN_AI_COMPLETION, output_str
+                        )
+
+                        if usage_data:
+                            (
+                                prompt_tokens,
+                                completion_tokens,
+                                cache_read,
+                                cache_creation,
+                            ) = _extract_anthropic_tokens(usage_data)
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                                prompt_tokens,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                completion_tokens,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                                cache_read,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                                cache_creation,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                                safe_serialize(usage_data),
+                            )
+                        set_span_attribute(
+                            span,
+                            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                            getattr(response, "model", model_name),
+                        )
+                except Exception as e:
+                    judgeval_logger.error(
+                        f"[anthropic wrapped] Error adding span metadata: {e}"
+                    )
+                finally:
+                    return response
 
         return wrapper
@@ -484,68 +501,77 @@ def wrap_anthropic_client(
             async with async_span_context(
                 tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
             ) as span:
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
-                )
-
-                response = await function(*args, **kwargs)
-
-                if isinstance(response, AnthropicMessage):
-                    output, usage_data = _format_anthropic_output(response)
-                    # Serialize structured data to JSON for span attribute
-                    if isinstance(output, list):
-                        output_str = orjson.dumps(
-                            output, option=orjson.OPT_INDENT_2
-                        ).decode()
-                    else:
-                        output_str = str(output) if output is not None else None
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                    )
-
-                    if usage_data:
-                        (
-                            prompt_tokens,
-                            completion_tokens,
-                            cache_read,
-                            cache_creation,
-                        ) = _extract_anthropic_tokens(usage_data)
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                            prompt_tokens,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                            completion_tokens,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                            cache_read,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
-                            cache_creation,
-                        )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.JUDGMENT_USAGE_METADATA,
-                            safe_serialize(usage_data),
-                        )
-                    set_span_attribute(
-                        span,
-                        AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                        getattr(response, "model", model_name),
-                    )
-                return response
+                try:
+                    tracer.add_agent_attributes_to_span(span)
+                    set_span_attribute(
+                        span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+                    )
+                    model_name = kwargs.get("model", "")
+                    set_span_attribute(
+                        span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
+                    )
+                except Exception as e:
+                    judgeval_logger.error(
+                        f"[anthropic wrapped_async] Error adding span metadata: {e}"
+                    )
+
+                response = await function(*args, **kwargs)
+
+                try:
+                    if isinstance(response, AnthropicMessage):
+                        output, usage_data = _format_anthropic_output(response)
+                        # Serialize structured data to JSON for span attribute
+                        if isinstance(output, list):
+                            output_str = safe_serialize(output)
+                        else:
+                            output_str = str(output) if output is not None else None
+                        set_span_attribute(
+                            span, AttributeKeys.GEN_AI_COMPLETION, output_str
+                        )
+
+                        if usage_data:
+                            (
+                                prompt_tokens,
+                                completion_tokens,
+                                cache_read,
+                                cache_creation,
+                            ) = _extract_anthropic_tokens(usage_data)
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                                prompt_tokens,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                completion_tokens,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                                cache_read,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                                cache_creation,
+                            )
+                            set_span_attribute(
+                                span,
+                                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                                safe_serialize(usage_data),
+                            )
+                        set_span_attribute(
+                            span,
+                            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                            getattr(response, "model", model_name),
+                        )
+                except Exception as e:
+                    judgeval_logger.error(
+                        f"[anthropic wrapped_async] Error adding span metadata: {e}"
+                    )
+                finally:
+                    return response
 
         return wrapper
@@ -590,16 +616,20 @@ def wrap_anthropic_client(
         return wrapper
 
     span_name = "ANTHROPIC_API_CALL"
-    if anthropic_Anthropic and isinstance(client, anthropic_Anthropic):
+    if anthropic_Anthropic is not None and isinstance(client, anthropic_Anthropic):
         setattr(client.messages, "create", wrapped(client.messages.create, span_name))
         setattr(
             client.messages,
             "stream",
             wrapped_sync_context_manager(client.messages.stream, span_name),
         )
-    elif anthropic_AsyncAnthropic and isinstance(client, anthropic_AsyncAnthropic):
+    elif anthropic_AsyncAnthropic is not None and isinstance(
+        client, anthropic_AsyncAnthropic
+    ):
         setattr(
-            client.messages, "create", wrapped_async(client.messages.create, span_name)
+            client.messages,
+            "create",
+            wrapped_async(client.messages.create, span_name),
        )
         setattr(
             client.messages,
```