judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/v1/data/scorer_data.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Optional
+
+from judgeval.v1.internal.api.api_types import ScorerData as APIScorerData
+
+
+@dataclass(slots=True)
+class ScorerData:
+    name: str
+    threshold: float
+    success: bool
+    score: Optional[float] = None
+    reason: Optional[str] = None
+    strict_mode: Optional[bool] = None
+    evaluation_model: Optional[str] = None
+    error: Optional[str] = None
+    additional_metadata: Dict[str, Any] = field(default_factory=dict)
+    id: Optional[str] = None
+
+    def to_dict(self) -> APIScorerData:
+        result: APIScorerData = {
+            "name": self.name,
+            "threshold": self.threshold,
+            "success": self.success,
+        }
+        if self.score is not None:
+            result["score"] = self.score
+        if self.reason is not None:
+            result["reason"] = self.reason
+        if self.strict_mode is not None:
+            result["strict_mode"] = self.strict_mode
+        if self.evaluation_model is not None:
+            result["evaluation_model"] = self.evaluation_model
+        if self.error is not None:
+            result["error"] = self.error
+        if self.additional_metadata:
+            result["additional_metadata"] = self.additional_metadata
+        if self.id is not None:
+            result["id"] = self.id
+        return result
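For orientation, a minimal sketch of how this dataclass might be used. The field values below are made up; only the optional fields that are actually set end up in the serialized payload.

from judgeval.v1.data.scorer_data import ScorerData

# Hypothetical values; unset optional fields are omitted from to_dict().
scorer = ScorerData(name="faithfulness", threshold=0.7, success=True, score=0.92)
payload = scorer.to_dict()
# {'name': 'faithfulness', 'threshold': 0.7, 'success': True, 'score': 0.92}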
judgeval/v1/data/scoring_result.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+from judgeval.v1.internal.api.api_types import (
+    OtelTraceSpan,
+    ScorerData as APIScorerData,
+    ScoringResult as APIScoringResult,
+)
+from judgeval.v1.data.example import Example
+from judgeval.v1.data.scorer_data import ScorerData
+
+
+@dataclass(slots=True)
+class ScoringResult:
+    success: bool
+    scorers_data: List[ScorerData]
+    name: Optional[str] = None
+    data_object: Optional[Union[OtelTraceSpan, Example]] = None
+    trace_id: Optional[str] = None
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
+
+    def to_dict(self) -> APIScoringResult:
+        scorers_list: List[APIScorerData] = [s.to_dict() for s in self.scorers_data]
+        result: APIScoringResult = {
+            "success": self.success,
+            "scorers_data": scorers_list,
+        }
+        if self.name is not None:
+            result["name"] = self.name
+        if self.data_object is not None:
+            if isinstance(self.data_object, Example):
+                result["data_object"] = self.data_object.to_dict()
+            else:
+                result["data_object"] = self.data_object
+        if self.trace_id is not None:
+            result["trace_id"] = self.trace_id
+        if self.run_duration is not None:
+            result["run_duration"] = self.run_duration
+        if self.evaluation_cost is not None:
+            result["evaluation_cost"] = self.evaluation_cost
+        return result
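A sketch of rolling individual scorer outputs up into one result, assuming the ScorerData class from the previous file; the optional data_object, trace_id, and cost fields are left unset here.

from judgeval.v1.data.scorer_data import ScorerData
from judgeval.v1.data.scoring_result import ScoringResult

# Hypothetical scorer outputs aggregated into a single result.
scorers = [
    ScorerData(name="faithfulness", threshold=0.7, success=True, score=0.92),
    ScorerData(name="answer_relevancy", threshold=0.5, success=False, score=0.41),
]
result = ScoringResult(success=all(s.success for s in scorers), scorers_data=scorers)
print(result.to_dict()["success"])  # False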
judgeval/v1/datasets/dataset.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+import datetime
+import orjson
+import os
+import yaml
+from dataclasses import dataclass
+from typing import List, Literal, Optional, Iterable, Iterator
+from itertools import islice
+from rich.progress import (
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    BarColumn,
+    TaskProgressColumn,
+)
+
+
+from judgeval.v1.data.example import Example
+from judgeval.v1.internal.api import JudgmentSyncClient
+from judgeval.logger import judgeval_logger
+
+
+def _batch_examples(
+    examples: Iterable[Example], batch_size: int = 100
+) -> Iterator[List[Example]]:
+    """Generator that yields batches of examples for efficient memory usage.
+
+    Works with any iterable including generators, consuming only batch_size items at a time.
+    """
+    iterator = iter(examples)
+    while True:
+        batch = list(islice(iterator, batch_size))
+        if not batch:
+            break
+        yield batch
+
+
+@dataclass
+class DatasetInfo:
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: str
+    entries: int
+    creator: str
+
+
+@dataclass
+class Dataset:
+    name: str
+    project_name: str
+    dataset_kind: str = "example"
+    examples: Optional[List[Example]] = None
+    client: Optional[JudgmentSyncClient] = None
+
+    def add_from_json(self, file_path: str, batch_size: int = 100) -> None:
+        with open(file_path, "rb") as file:
+            data = orjson.loads(file.read())
+        examples = []
+        for e in data:
+            if isinstance(e, dict):
+                name = e.get("name")
+                example = Example(name=name)
+                for key, value in e.items():
+                    if key != "name":
+                        example.set_property(key, value)
+                examples.append(example)
+            else:
+                examples.append(e)
+        self.add_examples(examples, batch_size=batch_size)
+
+    def add_from_yaml(self, file_path: str, batch_size: int = 100) -> None:
+        with open(file_path, "r") as file:
+            data = yaml.safe_load(file)
+        examples = []
+        for e in data:
+            if isinstance(e, dict):
+                name = e.get("name")
+                example = Example(name=name)
+                for key, value in e.items():
+                    if key != "name":
+                        example.set_property(key, value)
+                examples.append(example)
+            else:
+                examples.append(e)
+        self.add_examples(examples, batch_size=batch_size)
+
+    def add_examples(self, examples: Iterable[Example], batch_size: int = 100) -> None:
+        if not self.client:
+            return
+
+        batches = _batch_examples(examples, batch_size)
+        total_uploaded = 0
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[bold blue]{task.description}"),
+            BarColumn(pulse_style="green"),
+            TaskProgressColumn(),
+            TextColumn("[dim]{task.fields[info]}"),
+        ) as progress:
+            task = progress.add_task(
+                f"Uploading to {self.name}",
+                total=None,
+                info="",
+            )
+
+            batch_num = 0
+            for batch in batches:
+                if len(batch) > 0 and not isinstance(batch[0], Example):
+                    raise TypeError("Examples must be a list of Example objects")
+
+                batch_num += 1
+                batch_size_actual = len(batch)
+                total_uploaded += batch_size_actual
+
+                progress.update(
+                    task,
+                    advance=1,
+                    info=f"Batch {batch_num} ({batch_size_actual} examples, {total_uploaded} total)",
+                )
+
+                self.client.datasets_insert_examples_for_judgeval(
+                    {
+                        "dataset_name": self.name,
+                        "project_name": self.project_name,
+                        "examples": [e.to_dict() for e in batch],
+                    }
+                )
+
+        judgeval_logger.info(
+            f"Successfully added {total_uploaded} examples to dataset {self.name}"
+        )
+
+    def save_as(
+        self,
+        file_type: Literal["json", "yaml"],
+        dir_path: str,
+        save_name: Optional[str] = None,
+    ) -> None:
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+
+        file_name = save_name or datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+
+        examples_data = [e.to_dict() for e in self.examples] if self.examples else []
+
+        if file_type == "json":
+            with open(complete_path, "wb") as file:
+                file.write(
+                    orjson.dumps(
+                        {"examples": examples_data}, option=orjson.OPT_INDENT_2
+                    )
+                )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml.dump({"examples": examples_data}, file, default_flow_style=False)
+
+    def __iter__(self):
+        return iter(self.examples or [])
+
+    def __len__(self):
+        return len(self.examples) if self.examples else 0
+
+    def __str__(self):
+        return f"Dataset(name={self.name}, examples={len(self.examples) if self.examples else 0})"
+
+    def display(self, max_examples: int = 5) -> None:
+        from rich.console import Console
+        from rich.table import Table
+
+        console = Console()
+
+        total = len(self.examples) if self.examples else 0
+        console.print(f"\n[bold cyan]Dataset: {self.name}[/bold cyan]")
+        console.print(f"[dim]Project:[/dim] {self.project_name}")
+        console.print(f"[dim]Total examples:[/dim] {total}")
+
+        if not self.examples:
+            console.print("[dim]No examples found[/dim]")
+            return
+
+        display_count = min(max_examples, total)
+
+        if total > 0:
+            first_example = self.examples[0]
+            property_keys = list(first_example.properties.keys())
+
+            table = Table(show_header=True, header_style="bold")
+            table.add_column("#", style="dim", width=4)
+            table.add_column("Name", style="cyan")
+            for key in property_keys[:3]:
+                table.add_column(key, max_width=30)
+
+            for i, example in enumerate(self.examples[:display_count]):
+                row = [str(i + 1), example.name or "—"]
+                for key in property_keys[:3]:
+                    value = str(example.get_property(key) or "")
+                    if len(value) > 30:
+                        value = value[:27] + "..."
+                    row.append(value)
+                table.add_row(*row)
+
+            console.print()
+            console.print(table)
+
+            if total > display_count:
+                console.print(
+                    f"[dim]... and {total - display_count} more examples[/dim]"
+                )
+
+        console.print()
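A local sketch of the offline paths (no client attached, so add_examples returns early); the Example(name=...) and set_property usage is inferred from add_from_json above and may not reflect the full Example constructor.

from judgeval.v1.data.example import Example
from judgeval.v1.datasets.dataset import Dataset

example = Example(name="greeting")            # constructor signature assumed from usage above
example.set_property("input", "Say hello")
example.set_property("expected_output", "Hello!")

dataset = Dataset(name="demo", project_name="sandbox", examples=[example])
dataset.display()                                         # rich table preview in the terminal
dataset.save_as("json", "./exports", save_name="demo")    # writes ./exports/demo.json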
judgeval/v1/datasets/dataset_factory.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+from typing import List, Iterable
+
+from judgeval.v1.internal.api import JudgmentSyncClient
+from judgeval.v1.datasets.dataset import Dataset, DatasetInfo
+from judgeval.v1.data.example import Example
+from judgeval.logger import judgeval_logger
+
+
+class DatasetFactory:
+    __slots__ = "_client"
+
+    def __init__(self, client: JudgmentSyncClient):
+        self._client = client
+
+    def get(self, name: str, project_name: str) -> Dataset:
+        dataset = self._client.datasets_pull_for_judgeval(
+            {
+                "dataset_name": name,
+                "project_name": project_name,
+            }
+        )
+
+        dataset_kind = dataset.get("dataset_kind", "example")
+        examples_data = dataset.get("examples", []) or []
+
+        examples = []
+        for e in examples_data:
+            if isinstance(e, dict):
+                judgeval_logger.debug(f"Raw example keys: {e.keys()}")
+
+                data_obj = e.get("data", {})
+                if isinstance(data_obj, dict):
+                    example_id = data_obj.get("example_id", "")
+                    created_at = data_obj.get("created_at", "")
+                    name_field = data_obj.get("name")
+
+                    example = Example(
+                        example_id=example_id, created_at=created_at, name=name_field
+                    )
+
+                    for key, value in data_obj.items():
+                        if key not in ["example_id", "created_at", "name"]:
+                            example.set_property(key, value)
+
+                    examples.append(example)
+                    judgeval_logger.debug(
+                        f"Created example with name={name_field}, properties={list(example.properties.keys())}"
+                    )
+
+        judgeval_logger.info(f"Retrieved dataset {name} with {len(examples)} examples")
+        return Dataset(
+            name=name,
+            project_name=project_name,
+            dataset_kind=dataset_kind,
+            examples=examples,
+            client=self._client,
+        )
+
+    def create(
+        self,
+        name: str,
+        project_name: str,
+        examples: Iterable[Example] = [],
+        overwrite: bool = False,
+        batch_size: int = 100,
+    ) -> Dataset:
+        self._client.datasets_create_for_judgeval(
+            {
+                "name": name,
+                "project_name": project_name,
+                "examples": [],
+                "dataset_kind": "example",
+                "overwrite": overwrite,
+            }
+        )
+        judgeval_logger.info(f"Created dataset {name}")
+
+        if not isinstance(examples, list):
+            examples = list(examples)
+
+        dataset = Dataset(
+            name=name, project_name=project_name, examples=examples, client=self._client
+        )
+        dataset.add_examples(examples, batch_size=batch_size)
+        return dataset
+
+    def list(self, project_name: str) -> List[DatasetInfo]:
+        datasets = self._client.datasets_pull_all_for_judgeval(
+            {"project_name": project_name}
+        )
+        judgeval_logger.info(f"Fetched datasets for project {project_name}")
+        return [DatasetInfo(**d) for d in datasets]
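A sketch of the factory round-trip; how JudgmentSyncClient is constructed (API key, organization, base URL) is not shown in this hunk, so the client line is an assumption.

from judgeval.v1.internal.api import JudgmentSyncClient
from judgeval.v1.datasets.dataset_factory import DatasetFactory

client = JudgmentSyncClient()                 # construction details assumed
factory = DatasetFactory(client)

dataset = factory.create(name="demo", project_name="sandbox", examples=[], overwrite=True)
same_dataset = factory.get(name="demo", project_name="sandbox")
for info in factory.list(project_name="sandbox"):
    print(info.name, info.entries)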
judgeval/v1/evaluation/evaluation.py
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import time
+import uuid
+from datetime import datetime, timezone
+from typing import List, Optional
+
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
+
+from judgeval.v1.internal.api import JudgmentSyncClient
+from judgeval.v1.internal.api.api_types import ExampleEvaluationRun
+from judgeval.v1.data.example import Example
+from judgeval.v1.data.scoring_result import ScoringResult
+from judgeval.v1.data.scorer_data import ScorerData
+from judgeval.v1.scorers.base_scorer import BaseScorer
+from judgeval.logger import judgeval_logger
+
+
+class Evaluation:
+    __slots__ = ("_client",)
+
+    def __init__(self, client: JudgmentSyncClient):
+        self._client = client
+
+    def run(
+        self,
+        examples: List[Example],
+        scorers: List[BaseScorer],
+        project_name: str,
+        eval_run_name: str,
+        model: Optional[str] = None,
+        assert_test: bool = False,
+        timeout_seconds: int = 300,
+    ) -> List[ScoringResult]:
+        console = Console()
+        eval_id = str(uuid.uuid4())
+        created_at = datetime.now(timezone.utc).isoformat()
+
+        console.print("\n[bold cyan]Starting Evaluation[/bold cyan]")
+        console.print(f"[dim]Run:[/dim] {eval_run_name}")
+        console.print(f"[dim]Project:[/dim] {project_name}")
+        console.print(
+            f"[dim]Examples:[/dim] {len(examples)} | [dim]Scorers:[/dim] {len(scorers)}"
+        )
+        if model:
+            console.print(f"[dim]Model:[/dim] {model}")
+
+        judgeval_logger.info(f"Starting evaluation: {eval_run_name}")
+        judgeval_logger.info(f"Examples: {len(examples)}, Scorers: {len(scorers)}")
+
+        payload: ExampleEvaluationRun = {
+            "id": eval_id,
+            "project_name": project_name,
+            "eval_name": eval_run_name,
+            "created_at": created_at,
+            "examples": [e.to_dict() for e in examples],
+            "judgment_scorers": [s.get_scorer_config() for s in scorers],
+            "custom_scorers": [],
+        }
+
+        console.print()
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            TimeElapsedColumn(),
+            console=console,
+        ) as progress:
+            task = progress.add_task("Submitting evaluation...", total=None)
+            self._client.add_to_run_eval_queue_examples(payload)
+            judgeval_logger.info(f"Evaluation submitted: {eval_id}")
+
+            progress.update(task, description="Running evaluation...")
+            start_time = time.time()
+            poll_count = 0
+
+            while True:
+                elapsed = time.time() - start_time
+                if elapsed > timeout_seconds:
+                    raise TimeoutError(f"Evaluation timed out after {timeout_seconds}s")
+
+                response = self._client.fetch_experiment_run(
+                    {"experiment_run_id": eval_id, "project_name": project_name}
+                )
+                results_data = response.get("results", []) or []
+                poll_count += 1
+
+                completed = len(results_data)
+                total = len(examples)
+                progress.update(
+                    task,
+                    description=f"Running evaluation... ({completed}/{total} completed)",
+                )
+                judgeval_logger.info(
+                    f"Poll {poll_count}: {completed}/{total} results ready"
+                )
+
+                if completed == total:
+                    break
+                time.sleep(2)
+
+        console.print(
+            f"[green]✓[/green] Evaluation completed in [bold]{elapsed:.1f}s[/bold]"
+        )
+        judgeval_logger.info(f"Evaluation completed in {elapsed:.1f}s")
+
+        console.print()
+        results = []
+        passed = 0
+        failed = 0
+
+        for i, res in enumerate(results_data):
+            judgeval_logger.info(f"Processing result {i + 1}: {res.keys()}")
+
+            scorers_raw = res.get("scorers", [])
+            scorers_data = []
+            for scorer_dict in scorers_raw:
+                judgeval_logger.debug(f"Scorer data fields: {scorer_dict.keys()}")
+
+                scorer_fields = {
+                    "name": scorer_dict.get("name"),
+                    "threshold": scorer_dict.get("threshold"),
+                    "success": scorer_dict.get("success"),
+                    "score": scorer_dict.get("score"),
+                    "reason": scorer_dict.get("reason"),
+                    "strict_mode": scorer_dict.get("strict_mode"),
+                    "evaluation_model": scorer_dict.get("evaluation_model"),
+                    "error": scorer_dict.get("error"),
+                    "additional_metadata": scorer_dict.get("additional_metadata", {}),
+                    "id": scorer_dict.get("scorer_data_id") or scorer_dict.get("id"),
+                }
+                scorers_data.append(ScorerData(**scorer_fields))
+
+            success = all(s.success for s in scorers_data)
+
+            if success:
+                passed += 1
+                console.print(
+                    f"[green]✓[/green] Example {i + 1}: [green]PASSED[/green]"
+                )
+            else:
+                failed += 1
+                console.print(f"[red]✗[/red] Example {i + 1}: [red]FAILED[/red]")
+
+            for scorer_data in scorers_data:
+                score_str = (
+                    f"{scorer_data.score:.3f}"
+                    if scorer_data.score is not None
+                    else "N/A"
+                )
+                status_color = "green" if scorer_data.success else "red"
+                console.print(
+                    f" [dim]{scorer_data.name}:[/dim] [{status_color}]{score_str}[/{status_color}] (threshold: {scorer_data.threshold})"
+                )
+
+            results.append(
+                ScoringResult(
+                    success=success,
+                    scorers_data=scorers_data,
+                )
+            )
+
+        console.print()
+        url = response.get("ui_results_url", "")
+
+        if passed == len(results):
+            console.print(
+                f"[bold green]✓ All tests passed![/bold green] ({passed}/{len(results)})"
+            )
+        else:
+            console.print(
+                f"[bold yellow]⚠ Results:[/bold yellow] [green]{passed} passed[/green] | [red]{failed} failed[/red]"
+            )
+
+        console.print(f"[dim]View full details:[/dim] [link={url}]{url}[/link]\n")
+
+        if assert_test and not all(r.success for r in results):
+            raise AssertionError(
+                f"Evaluation failed: {failed}/{len(results)} tests failed"
+            )
+
+        return results
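A sketch of submitting a run with this class. The built-in scorer import is assumed from the judgeval/v1/scorers/built_in/ entries in the file list above (the actual class name is not shown in this diff), and the client construction is again an assumption; the Example usage follows the pattern from dataset.py.

from judgeval.v1.internal.api import JudgmentSyncClient
from judgeval.v1.evaluation.evaluation import Evaluation
from judgeval.v1.data.example import Example
from judgeval.v1.scorers.built_in.faithfulness import Faithfulness  # class name assumed

example = Example(name="greeting")            # constructor signature assumed
example.set_property("input", "Say hello")
example.set_property("actual_output", "Hello!")

results = Evaluation(client=JudgmentSyncClient()).run(   # client construction assumed
    examples=[example],
    scorers=[Faithfulness()],
    project_name="sandbox",
    eval_run_name="smoke-test",
    assert_test=True,         # raises AssertionError if any example fails
    timeout_seconds=120,
)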
judgeval/v1/evaluation/evaluation_factory.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from judgeval.v1.internal.api import JudgmentSyncClient
+from judgeval.v1.evaluation.evaluation import Evaluation
+
+
+class EvaluationFactory:
+    __slots__ = "_client"
+
+    def __init__(
+        self,
+        client: JudgmentSyncClient,
+    ):
+        self._client = client
+
+    def create(self) -> Evaluation:
+        return Evaluation(client=self._client)
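The factory is a thin seam for wiring the shared client into Evaluation instances; a short sketch, with the client construction assumed as before.

from judgeval.v1.internal.api import JudgmentSyncClient
from judgeval.v1.evaluation.evaluation_factory import EvaluationFactory

factory = EvaluationFactory(client=JudgmentSyncClient())  # client construction assumed
evaluation = factory.create()   # returns an Evaluation bound to the same client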