judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/dataset/__init__.py
ADDED
@@ -0,0 +1,335 @@
import datetime
import orjson
import os
import yaml
from dataclasses import dataclass
from typing import List, Literal, Optional, Iterable, Iterator
from itertools import islice
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    BarColumn,
    TaskProgressColumn,
)

from judgeval.data import Example
from judgeval.data.trace import Trace
from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
from judgeval.api import JudgmentSyncClient
from judgeval.logger import judgeval_logger
from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

from judgeval.data.judgment_types import DatasetKind


def _batch_examples(
    examples: Iterable[Example], batch_size: int = 100
) -> Iterator[List[Example]]:
    """Generator that yields batches of examples for efficient memory usage.

    Works with any iterable including generators, consuming only batch_size items at a time.
    """
    iterator = iter(examples)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        yield batch


@dataclass
class DatasetInfo:
    dataset_id: str
    name: str
    created_at: str
    kind: DatasetKind
    entries: int
    creator: str


@dataclass
class Dataset:
    name: str
    project_name: str
    dataset_kind: DatasetKind = DatasetKind.example
    examples: Optional[List[Example]] = None
    traces: Optional[List[Trace]] = None
    judgment_api_key: str | None = JUDGMENT_API_KEY
    organization_id: str | None = JUDGMENT_ORG_ID

    @classmethod
    def get(
        cls,
        name: str,
        project_name: str,
    ):
        if not cls.judgment_api_key or not cls.organization_id:
            raise ValueError("Judgment API key and organization ID are required")
        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
        dataset = client.datasets_pull_for_judgeval(
            {
                "dataset_name": name,
                "project_name": project_name,
            },
        )
        if not dataset:
            raise ValueError(f"Dataset {name} not found in project {project_name}")

        dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))

        if dataset_kind == DatasetKind.example:
            examples = dataset.get("examples", [])
            if examples is None:
                examples = []

            for e in examples:
                if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
                    e.update(e.pop("data"))  # type: ignore
                    e.pop("example_id")  # TODO: remove once scorer data migration is complete
            judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
            return cls(
                name=name,
                project_name=project_name,
                dataset_kind=dataset_kind,
                examples=[Example(**e) for e in examples],
            )

        elif dataset_kind == DatasetKind.trace:
            trace_data = dataset.get("traces", [])
            if trace_data is None:
                trace_data = []

            traces = []
            for trace_item in trace_data:
                if isinstance(trace_item, dict):
                    trace = Trace.from_dataset_trace_with_spans(trace_item)
                    traces.append(trace)

            judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
            return cls(
                name=name,
                project_name=project_name,
                dataset_kind=dataset_kind,
                traces=traces,
            )

        else:
            raise ValueError(f"Unsupported dataset kind: {dataset_kind}")

    @classmethod
    def create(
        cls,
        name: str,
        project_name: str,
        examples: Iterable[Example] = [],
        overwrite: bool = False,
        batch_size: int = 100,
    ):
        """Create a dataset with batched example uploads for large datasets.

        Args:
            name: Dataset name
            project_name: Project name
            examples: Iterable of examples to add (can be a list, generator, etc.)
            overwrite: Whether to overwrite existing dataset
            batch_size: Number of examples to upload per batch (default: 100)
        """
        if not cls.judgment_api_key or not cls.organization_id:
            raise ValueError("Judgment API key and organization ID are required")

        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)

        client.datasets_create_for_judgeval(
            {
                "name": name,
                "project_name": project_name,
                "examples": [],  # type: ignore
                "dataset_kind": "example",
                "overwrite": overwrite,
            }
        )
        judgeval_logger.info(f"Created dataset {name}")

        if not isinstance(examples, list):
            examples = list(examples)

        dataset = cls(
            name=name,
            project_name=project_name,
            examples=examples,
        )
        dataset.add_examples(examples, batch_size=batch_size)

        return dataset

    @classmethod
    def list(cls, project_name: str):
        if not cls.judgment_api_key or not cls.organization_id:
            raise ValueError("Judgment API key and organization ID are required")
        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
        datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})

        judgeval_logger.info(f"Fetched all datasets for project {project_name}!")

        return [DatasetInfo(**dataset_info) for dataset_info in datasets]

    def add_from_json(self, file_path: str) -> None:
        """
        Adds examples from a JSON file.

        The JSON file is expected to have the following format:
        [
            {
                "key_01": "value_01",
                "key_02": "value_02"
            },
            {
                "key_11": "value_11",
                "key_12": "value_12",
                "key_13": "value_13"
            },
            ...
        ]
        """
        examples = get_examples_from_json(file_path)
        self.add_examples(examples)

    def add_from_yaml(self, file_path: str) -> None:
        """
        Adds examples from a YAML file.

        The YAML file is expected to have the following format:
        - key_01: value_01
          key_02: value_02
        - key_11: value_11
          key_12: value_12
          key_13: value_13
        ...
        """

        examples = get_examples_from_yaml(file_path)
        self.add_examples(examples)

    def add_examples(self, examples: Iterable[Example], batch_size: int = 100) -> None:
        if not self.judgment_api_key or not self.organization_id:
            raise ValueError("Judgment API key and organization ID are required")

        client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)

        batches = _batch_examples(examples, batch_size)
        total_uploaded = 0

        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(pulse_style="green"),
            TaskProgressColumn(),
            TextColumn("[dim]{task.fields[info]}"),
        ) as progress:
            task = progress.add_task(
                f"Uploading to {self.name}",
                total=None,
                info="",
            )

            batch_num = 0
            for batch in batches:
                if len(batch) > 0 and not isinstance(batch[0], Example):
                    raise TypeError("Examples must be a list of Example objects")

                batch_num += 1
                batch_size_actual = len(batch)
                total_uploaded += batch_size_actual

                progress.update(
                    task,
                    advance=1,
                    info=f"Batch {batch_num} ({batch_size_actual} examples, {total_uploaded} total)",
                )

                client.datasets_insert_examples_for_judgeval(
                    {
                        "dataset_name": self.name,
                        "project_name": self.project_name,
                        "examples": batch,  # type: ignore
                    }
                )

        judgeval_logger.info(
            f"Successfully added {total_uploaded} examples to dataset {self.name}"
        )

    def save_as(
        self,
        file_type: Literal["json", "yaml"],
        dir_path: str,
        save_name: str | None = None,
    ) -> None:
        """
        Saves the dataset as a file. Save only the examples.

        Args:
            file_type (Literal["json", "yaml"]): The file type to save the dataset as.
            dir_path (str): The directory path to save the file to.
            save_name (str, optional): The name of the file to save. Defaults to None.
        """
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        file_name = (
            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            if save_name is None
            else save_name
        )
        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
        if file_type == "json":
            with open(complete_path, "wb") as file:
                file.write(
                    orjson.dumps(
                        {
                            "examples": [e.to_dict() for e in self.examples]
                            if self.examples
                            else [],
                        },
                        option=orjson.OPT_INDENT_2,
                    )
                )
        elif file_type == "yaml":
            with open(complete_path, "w") as file:
                yaml_data = {
                    "examples": [e.to_dict() for e in self.examples]
                    if self.examples
                    else [],
                }
                yaml.dump(yaml_data, file, default_flow_style=False)
        else:
            ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
            raise TypeError(
                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
            )

    def __iter__(self):
        if self.dataset_kind == DatasetKind.example and self.examples:
            return iter(self.examples)
        elif self.dataset_kind == DatasetKind.trace and self.traces:
            return iter(self.traces)
        else:
            return iter([])

    def __len__(self):
        if self.dataset_kind == DatasetKind.example and self.examples:
            return len(self.examples)
        elif self.dataset_kind == DatasetKind.trace and self.traces:
            return len(self.traces)
        else:
            return 0

    def __str__(self):
        if self.dataset_kind == DatasetKind.example:
            return (
                f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
            )
        else:
            return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
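For orientation, here is a minimal usage sketch of the new Dataset API above. The dataset and project names are hypothetical, and the Example field names are illustrative only (Example's schema also changed in this release and is not shown here); it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment.

# Usage sketch for the Dataset API above; names are hypothetical and the
# Example field names are illustrative, not confirmed against 0.23.0.
from judgeval.data import Example
from judgeval.dataset import Dataset

# Any iterable works, including a generator; create() materializes it and
# uploads in batches, so 250 examples with batch_size=100 means 3 upload calls.
examples = (
    Example(input=f"question {i}", actual_output=f"answer {i}")  # illustrative fields
    for i in range(250)
)
dataset = Dataset.create(
    name="qa-dataset",            # hypothetical
    project_name="demo-project",  # hypothetical
    examples=examples,
    batch_size=100,
)

# Later: pull the dataset back, check its size, and snapshot the examples to disk.
dataset = Dataset.get(name="qa-dataset", project_name="demo-project")
print(len(dataset))  # number of examples (or traces, for trace datasets)
dataset.save_as("json", dir_path="./exports", save_name="qa-dataset")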
judgeval/env.py
ADDED
@@ -0,0 +1,55 @@
from __future__ import annotations
from dotenv import load_dotenv

load_dotenv()

import os
from typing import overload


@overload
def optional_env_var(var_name: str) -> str | None: ...


@overload
def optional_env_var(var_name: str, default: str) -> str: ...


def optional_env_var(var_name: str, default: str | None = None) -> str | None:
    return os.getenv(var_name, default)


JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var(
    "JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5-mini"
)
JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
    "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
)
JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
    optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
)


JUDGMENT_ENABLE_MONITORING = optional_env_var("JUDGMENT_ENABLE_MONITORING", "true")
JUDGMENT_ENABLE_EVALUATIONS = optional_env_var("JUDGMENT_ENABLE_EVALUATIONS", "true")

JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")


JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
JUDGMENT_LOG_LEVEL = optional_env_var("JUDGMENT_LOG_LEVEL", "WARNING")


TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")