judgeval 0.14.1__tar.gz → 0.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.14.1 → judgeval-0.15.0}/PKG-INFO +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/pyproject.toml +3 -2
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/api_generator.py +0 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/openapi_transform.py +0 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/__init__.py +0 -22
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/api_types.py +18 -26
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/judgment_types.py +23 -34
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/dataset/__init__.py +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/evaluation/__init__.py +9 -21
- judgeval-0.15.0/src/judgeval/integrations/openlit/__init__.py +50 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +25 -2
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/__init__.py +4 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/version.py +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/uv.lock +339 -187
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/claude.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.gitignore +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.pre-commit-config.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/LICENSE.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/README.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/agent.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/company.jpg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/company_banner.jpg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/darkmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/full_logo.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/icon.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/lightmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/white_background.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/data.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/document.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/errors.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_page.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/logo-light.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/online_eval.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/product_shot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/test.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/tests.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_demo.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/pytest.ini +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/update_types.sh +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/cli.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/result.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/trace.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/env.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/logger.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/anthropic/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/google/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/groq/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/openai/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/together/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/local_eval_queue.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/processors/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/trainer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/warnings.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/update_version.py +0 -0
{judgeval-0.14.1 → judgeval-0.15.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.14.1"
+version = "0.15.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -58,7 +58,7 @@ dev = [
     "langchain-core>=0.3.72",
     "langgraph>=0.6.4",
     "mypy>=1.17.1",
-    "openai>=1.
+    "openai>=1.92.0",
     "opentelemetry-instrumentation-openai>=0.44.1",
     "ruff>=0.9.1,<0.10.0",
     "together>=1.5.21",
@@ -73,6 +73,7 @@ dev = [
     "langchain-tavily>=0.2.11",
     "streamlit>=1.49.1",
     "langchain-community>=0.3.29",
+    "openlit>=1.35.5",
 ]

{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/__init__.py

@@ -111,16 +111,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -318,18 +308,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     async def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/api_types.py

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:51+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
     project_name: str


+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str

@@ -180,18 +189,10 @@ class OtelTraceSpan(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
     events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]


 class OtelSpanListItemScores(TypedDict):
@@ -206,7 +207,7 @@ class OtelSpanDetailScores(TypedDict):
     score: float
     reason: NotRequired[Optional[str]]
     name: str
-
+    example_id: NotRequired[Optional[str]]


 class ExampleEvaluationRun(TypedDict):
@@ -244,15 +245,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -279,14 +271,14 @@ class OtelTraceListItem(TypedDict):
     organization_id: str
     project_id: str
     trace_id: str
-
+    created_at: str
     duration: NotRequired[Optional[int]]
     has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
-
-    error: NotRequired[
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
@@ -310,9 +302,9 @@ class OtelSpanDetail(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
     llm_cost: NotRequired[Optional[float]]
     prompt_tokens: NotRequired[Optional[int]]
@@ -335,5 +327,5 @@ class DatasetReturn(TypedDict):
     name: str
     project_name: str
     dataset_kind: DatasetKind
-    examples: NotRequired[
+    examples: NotRequired[List[Example]]
     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/judgment_types.py

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:50+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -199,22 +213,10 @@ class OtelTraceSpan(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )


 class OtelSpanListItemScores(BaseModel):
@@ -229,7 +231,7 @@ class OtelSpanDetailScores(BaseModel):
     score: Annotated[float, Field(title="Score")]
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     name: Annotated[str, Field(title="Name")]
-
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None


 class ExampleEvaluationRun(BaseModel):
@@ -277,15 +279,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -314,16 +307,14 @@ class OtelTraceListItem(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
-
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
     has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-
-
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
@@ -338,7 +329,7 @@ class OtelTraceListItem(BaseModel):
 class OtelSpanDetail(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
     trace_id: Annotated[str, Field(title="Trace Id")]
     span_id: Annotated[str, Field(title="Span Id")]
     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
@@ -353,11 +344,9 @@ class OtelSpanDetail(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[
         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
     ] = None
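The DatasetsTableRow model above (and its TypedDict counterpart in api_types.py) describes one row of the datasets listing and appears to take over from the removed DatasetInfo. A small illustrative instantiation; the field names come from the generated model above, while the values are made up:

    from judgeval.data.judgment_types import DatasetsTableRow, Kind

    # Illustrative values only; the model validates types as declared in the diff above.
    row = DatasetsTableRow(
        dataset_id="ds_123",
        name="support-tickets",
        created_at="2025-09-30T18:06:50+00:00",
        kind=Kind.example,
        entries=42,
        creator="andrew@judgmentlabs.ai",
    )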
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/evaluation/__init__.py

@@ -84,7 +84,7 @@ def log_evaluation_results(

 def _poll_evaluation_until_complete(
     evaluation_run: ExampleEvaluationRun,
-
+    expected_examples_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
                     "project_name": project_name,
                 }
             )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             url = results_response.get("ui_results_url")

             scoring_result_list = []
-            for res in
+            for res in example_scorer_pairings:
                 example = res.get("data", {}).copy()
                 example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
@@ -241,14 +234,9 @@ def run_eval(
            )
            raise JudgmentRuntimeError(error_message)

-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
         results, url = _poll_evaluation_until_complete(
             evaluation_run=evaluation_run,
-
+            expected_examples_count=len(evaluation_run.examples),
         )
     finally:
         stop_event.set()
judgeval-0.15.0/src/judgeval/integrations/openlit/__init__.py (new file)

@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
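Together with the openlit pin added to the dev dependencies in pyproject.toml, this module routes OpenLIT-instrumented spans through Judgment's OTLP endpoint once the Tracer singleton exists. A minimal usage sketch; the Tracer constructor argument shown here is illustrative and assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment:

    from judgeval.tracer import Tracer
    from judgeval.integrations.openlit import Openlit

    # Openlit.initialize() reads api_key, organization_id, and project_name from the
    # existing Tracer instance, so the Tracer must be created first.
    tracer = Tracer(project_name="my-project")

    # Extra keyword arguments are forwarded to openlit.init().
    Openlit.initialize()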
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py

@@ -20,6 +20,7 @@ def push_prompt_scorer(
     threshold: float,
     options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     is_trace: bool = False,
@@ -33,6 +34,7 @@ def push_prompt_scorer(
             "threshold": threshold,
             "options": options,
             "model": model,
+            "description": description,
             "is_trace": is_trace,
         }
     )
@@ -102,6 +104,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

@@ -130,6 +133,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,6 +146,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         threshold: float = 0.5,
         options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -158,6 +163,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold,
             options,
             model,
+            description,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -170,6 +176,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=threshold,
             options=options,
             model=model,
+            description=description,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -215,6 +222,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated options for {self.name}")

+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -248,7 +263,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return copy(self.options) if self.options is not None else None

-    def get_name(self) -> str:
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -264,6 +285,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "prompt": self.prompt,
             "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }

     def push_prompt_scorer(self):
@@ -276,13 +298,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.threshold,
             self.options,
             self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
             isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
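The new description travels through push_prompt_scorer and the serialized scorer config, so it can be supplied when the scorer is created or changed later with set_description. A rough sketch; the concrete class name, import path, and the exact signature of the create-style classmethod are assumptions based only on the parameters visible in the diff above:

    from judgeval.scorers import PromptScorer  # assumed concrete subclass of BasePromptScorer

    scorer = PromptScorer.create(           # create-style classmethod assumed from the diff
        name="helpfulness",
        prompt="Rate how helpful the response is.",
        threshold=0.5,
        description="Scores responses on helpfulness.",  # new in 0.15.0
    )

    scorer.set_description("Scores helpfulness of agent responses.")  # pushes the update to the server
    print(scorer.get_description())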
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/__init__.py

@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
     def get_current_agent_context(self):
         return self.agent_context

+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
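get_span_processor simply exposes the span processor the tracer already owns, which is handy when wiring external instrumentation (such as the OpenLIT integration above) into the same export pipeline. A brief sketch, assuming an already-configured Tracer; the constructor argument is illustrative:

    from judgeval.tracer import Tracer

    tracer = Tracer(project_name="my-project")   # illustrative constructor argument
    processor = tracer.get_span_processor()      # JudgmentSpanProcessor used for span export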
|