judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from typing import List, Union
|
|
2
2
|
from judgeval.data import ScorerData, Example
|
|
3
|
-
from judgeval.data.
|
|
4
|
-
from judgeval.data.judgment_types import ScoringResultJudgmentType
|
|
3
|
+
from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
class ScoringResult(
|
|
6
|
+
class ScoringResult(JudgmentScoringResult):
|
|
8
7
|
"""
|
|
9
8
|
A ScoringResult contains the output of one or more scorers applied to a single example.
|
|
10
9
|
Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
|
|
@@ -17,15 +16,14 @@ class ScoringResult(ScoringResultJudgmentType):
|
|
|
17
16
|
|
|
18
17
|
"""
|
|
19
18
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
}
|
|
19
|
+
# Need to override this so that it uses this repo's Example class
|
|
20
|
+
data_object: Example
|
|
21
|
+
scorers_data: List[ScorerData]
|
|
22
|
+
|
|
23
|
+
def model_dump(self, **kwargs):
|
|
24
|
+
data = super().model_dump(**kwargs)
|
|
25
|
+
data["data_object"] = self.data_object.model_dump()
|
|
26
|
+
return data
|
|
29
27
|
|
|
30
28
|
def __str__(self) -> str:
|
|
31
29
|
return f"ScoringResult(\
|
|
@@ -36,7 +34,7 @@ class ScoringResult(ScoringResultJudgmentType):
|
|
|
36
34
|
|
|
37
35
|
|
|
38
36
|
def generate_scoring_result(
|
|
39
|
-
data_object: Union[Example
|
|
37
|
+
data_object: Union[Example],
|
|
40
38
|
scorers_data: List[ScorerData],
|
|
41
39
|
run_duration: float,
|
|
42
40
|
success: bool,
|
|
@@ -47,12 +45,7 @@ def generate_scoring_result(
|
|
|
47
45
|
When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
|
|
48
46
|
At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
|
|
49
47
|
"""
|
|
50
|
-
if hasattr(data_object, "name") and data_object.name is not None:
|
|
51
|
-
name = data_object.name
|
|
52
|
-
else:
|
|
53
|
-
name = "Test Case Placeholder"
|
|
54
48
|
scoring_result = ScoringResult(
|
|
55
|
-
name=name,
|
|
56
49
|
data_object=data_object,
|
|
57
50
|
success=success,
|
|
58
51
|
scorers_data=scorers_data,
|
judgeval/data/scorer_data.py
CHANGED
|
@@ -4,36 +4,13 @@ Implementation of the ScorerData class.
|
|
|
4
4
|
ScorerData holds the information related to a single, completed Scorer evaluation run.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from judgeval.data.judgment_types import ScorerData
|
|
8
10
|
from judgeval.scorers import BaseScorer
|
|
9
11
|
from typing import List
|
|
10
12
|
|
|
11
13
|
|
|
12
|
-
class ScorerData(ScorerDataJudgmentType):
|
|
13
|
-
"""
|
|
14
|
-
ScorerData holds the information related to a single, completed Scorer evaluation run.
|
|
15
|
-
|
|
16
|
-
For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
|
|
17
|
-
object will contain whether the example passed its threshold expectation, as well as more detailed
|
|
18
|
-
information surrounding the evaluation run such as the claims and verdicts generated by the
|
|
19
|
-
judge model(s).
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
def to_dict(self) -> dict:
|
|
23
|
-
"""Convert the ScorerData instance to a JSON-serializable dictionary."""
|
|
24
|
-
return {
|
|
25
|
-
"name": self.name,
|
|
26
|
-
"threshold": self.threshold,
|
|
27
|
-
"success": self.success,
|
|
28
|
-
"score": self.score,
|
|
29
|
-
"reason": self.reason,
|
|
30
|
-
"strict_mode": self.strict_mode,
|
|
31
|
-
"evaluation_model": self.evaluation_model,
|
|
32
|
-
"error": self.error,
|
|
33
|
-
"additional_metadata": self.additional_metadata,
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
|
|
37
14
|
def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
|
|
38
15
|
"""
|
|
39
16
|
After a `scorer` is run, it contains information about the example that was evaluated
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import
|
|
1
|
+
import orjson
|
|
2
2
|
import sys
|
|
3
3
|
from typing import Any, Dict, Generator, List
|
|
4
|
-
|
|
4
|
+
import requests
|
|
5
5
|
|
|
6
6
|
spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"
|
|
7
7
|
|
|
@@ -10,8 +10,8 @@ if spec_file.startswith("http"):
|
|
|
10
10
|
r.raise_for_status()
|
|
11
11
|
SPEC = r.json()
|
|
12
12
|
else:
|
|
13
|
-
with open(spec_file, "
|
|
14
|
-
SPEC =
|
|
13
|
+
with open(spec_file, "rb") as f:
|
|
14
|
+
SPEC = orjson.loads(f.read())
|
|
15
15
|
|
|
16
16
|
JUDGEVAL_PATHS: List[str] = [
|
|
17
17
|
"/log_eval_results/",
|
|
@@ -120,4 +120,4 @@ spec = {
|
|
|
120
120
|
},
|
|
121
121
|
}
|
|
122
122
|
|
|
123
|
-
print(
|
|
123
|
+
print(orjson.dumps(spec, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
judgeval/data/trace.py
CHANGED
|
@@ -1,200 +1,121 @@
|
|
|
1
|
-
from typing import Any
|
|
2
|
-
import json
|
|
3
|
-
import sys
|
|
4
|
-
import threading
|
|
5
|
-
from datetime import datetime, timezone
|
|
6
|
-
from judgeval.data.judgment_types import (
|
|
7
|
-
TraceUsageJudgmentType,
|
|
8
|
-
TraceSpanJudgmentType,
|
|
9
|
-
TraceJudgmentType,
|
|
10
|
-
)
|
|
11
|
-
from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
|
|
1
|
+
from typing import Optional, List, Dict, Any
|
|
12
2
|
from pydantic import BaseModel
|
|
3
|
+
from .judgment_types import (
|
|
4
|
+
OtelSpanDetailScores,
|
|
5
|
+
OtelSpanDetail,
|
|
6
|
+
OtelTraceListItem,
|
|
7
|
+
)
|
|
13
8
|
|
|
14
9
|
|
|
15
|
-
class TraceUsage(
|
|
16
|
-
|
|
10
|
+
class TraceUsage(BaseModel):
|
|
11
|
+
prompt_tokens: Optional[int] = None
|
|
12
|
+
completion_tokens: Optional[int] = None
|
|
13
|
+
cache_creation_input_tokens: Optional[int] = None
|
|
14
|
+
cache_read_input_tokens: Optional[int] = None
|
|
15
|
+
total_tokens: Optional[int] = None
|
|
16
|
+
prompt_tokens_cost_usd: Optional[float] = None
|
|
17
|
+
completion_tokens_cost_usd: Optional[float] = None
|
|
18
|
+
total_cost_usd: Optional[float] = None
|
|
19
|
+
model_name: Optional[str] = None
|
|
17
20
|
|
|
18
21
|
|
|
19
|
-
class
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
"span_id": self.span_id,
|
|
23
|
-
"trace_id": self.trace_id,
|
|
24
|
-
"depth": self.depth,
|
|
25
|
-
"created_at": datetime.fromtimestamp(
|
|
26
|
-
self.created_at, tz=timezone.utc
|
|
27
|
-
).isoformat(),
|
|
28
|
-
"inputs": self._serialize_value(self.inputs),
|
|
29
|
-
"output": self._serialize_value(self.output),
|
|
30
|
-
"error": self._serialize_value(self.error),
|
|
31
|
-
"parent_span_id": self.parent_span_id,
|
|
32
|
-
"function": self.function,
|
|
33
|
-
"duration": self.duration,
|
|
34
|
-
"span_type": self.span_type,
|
|
35
|
-
"usage": self.usage.model_dump() if self.usage else None,
|
|
36
|
-
"has_evaluation": self.has_evaluation,
|
|
37
|
-
"agent_name": self.agent_name,
|
|
38
|
-
"state_before": self.state_before,
|
|
39
|
-
"state_after": self.state_after,
|
|
40
|
-
"additional_metadata": self._serialize_value(self.additional_metadata),
|
|
41
|
-
"update_id": self.update_id,
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
def __init__(self, **data):
|
|
45
|
-
super().__init__(**data)
|
|
46
|
-
# Initialize thread lock for thread-safe update_id increment
|
|
47
|
-
self._update_id_lock = threading.Lock()
|
|
48
|
-
|
|
49
|
-
def increment_update_id(self) -> int:
|
|
50
|
-
"""
|
|
51
|
-
Thread-safe method to increment the update_id counter.
|
|
52
|
-
Returns:
|
|
53
|
-
int: The new update_id value after incrementing
|
|
54
|
-
"""
|
|
55
|
-
with self._update_id_lock:
|
|
56
|
-
self.update_id += 1
|
|
57
|
-
return self.update_id
|
|
58
|
-
|
|
59
|
-
def set_update_id_to_ending_number(
|
|
60
|
-
self, ending_number: int = SPAN_LIFECYCLE_END_UPDATE_ID
|
|
61
|
-
) -> int:
|
|
62
|
-
"""
|
|
63
|
-
Thread-safe method to set the update_id to a predetermined ending number.
|
|
64
|
-
|
|
65
|
-
Args:
|
|
66
|
-
ending_number (int): The number to set update_id to. Defaults to SPAN_LIFECYCLE_END_UPDATE_ID.
|
|
67
|
-
|
|
68
|
-
Returns:
|
|
69
|
-
int: The new update_id value after setting
|
|
70
|
-
"""
|
|
71
|
-
with self._update_id_lock:
|
|
72
|
-
self.update_id = ending_number
|
|
73
|
-
return self.update_id
|
|
74
|
-
|
|
75
|
-
def print_span(self):
|
|
76
|
-
"""Print the span with proper formatting and parent relationship information."""
|
|
77
|
-
indent = " " * self.depth
|
|
78
|
-
parent_info = (
|
|
79
|
-
f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
|
|
80
|
-
)
|
|
81
|
-
print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
|
|
82
|
-
|
|
83
|
-
def _is_json_serializable(self, obj: Any) -> bool:
|
|
84
|
-
"""Helper method to check if an object is JSON serializable."""
|
|
85
|
-
try:
|
|
86
|
-
json.dumps(obj)
|
|
87
|
-
return True
|
|
88
|
-
except (TypeError, OverflowError, ValueError):
|
|
89
|
-
return False
|
|
90
|
-
|
|
91
|
-
def safe_stringify(self, output, function_name):
|
|
92
|
-
"""
|
|
93
|
-
Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
|
|
94
|
-
"""
|
|
95
|
-
# Handle Pydantic models
|
|
96
|
-
if hasattr(output, "model_dump"):
|
|
97
|
-
try:
|
|
98
|
-
return output.model_dump()
|
|
99
|
-
except Exception:
|
|
100
|
-
pass
|
|
101
|
-
|
|
102
|
-
# Handle LangChain messages and similar objects with content/type
|
|
103
|
-
if hasattr(output, "content") and hasattr(output, "type"):
|
|
104
|
-
try:
|
|
105
|
-
result = {"type": output.type, "content": output.content}
|
|
106
|
-
# Add additional fields if they exist
|
|
107
|
-
if hasattr(output, "additional_kwargs"):
|
|
108
|
-
result["additional_kwargs"] = output.additional_kwargs
|
|
109
|
-
if hasattr(output, "response_metadata"):
|
|
110
|
-
result["response_metadata"] = output.response_metadata
|
|
111
|
-
if hasattr(output, "name"):
|
|
112
|
-
result["name"] = output.name
|
|
113
|
-
return result
|
|
114
|
-
except Exception:
|
|
115
|
-
pass
|
|
116
|
-
|
|
117
|
-
if hasattr(output, "dict"):
|
|
118
|
-
try:
|
|
119
|
-
return output.dict()
|
|
120
|
-
except Exception:
|
|
121
|
-
pass
|
|
122
|
-
|
|
123
|
-
if hasattr(output, "to_dict"):
|
|
124
|
-
try:
|
|
125
|
-
return output.to_dict()
|
|
126
|
-
except Exception:
|
|
127
|
-
pass
|
|
128
|
-
|
|
129
|
-
if hasattr(output, "__dataclass_fields__"):
|
|
130
|
-
try:
|
|
131
|
-
import dataclasses
|
|
132
|
-
|
|
133
|
-
return dataclasses.asdict(output)
|
|
134
|
-
except Exception:
|
|
135
|
-
pass
|
|
136
|
-
|
|
137
|
-
if hasattr(output, "__dict__"):
|
|
138
|
-
try:
|
|
139
|
-
return output.__dict__
|
|
140
|
-
except Exception:
|
|
141
|
-
pass
|
|
142
|
-
|
|
143
|
-
try:
|
|
144
|
-
return str(output)
|
|
145
|
-
except (TypeError, OverflowError, ValueError):
|
|
146
|
-
pass
|
|
147
|
-
|
|
148
|
-
try:
|
|
149
|
-
return repr(output)
|
|
150
|
-
except (TypeError, OverflowError, ValueError):
|
|
151
|
-
pass
|
|
152
|
-
|
|
153
|
-
return None
|
|
154
|
-
|
|
155
|
-
def _serialize_value(self, value: Any) -> Any:
|
|
156
|
-
"""Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
|
|
157
|
-
if value is None:
|
|
158
|
-
return None
|
|
159
|
-
|
|
160
|
-
recursion_limit = sys.getrecursionlimit()
|
|
161
|
-
recursion_limit = int(recursion_limit * 0.75)
|
|
162
|
-
|
|
163
|
-
def serialize_value(value, current_depth=0):
|
|
164
|
-
try:
|
|
165
|
-
if current_depth > recursion_limit:
|
|
166
|
-
return {"error": "max_depth_reached: " + type(value).__name__}
|
|
167
|
-
|
|
168
|
-
if isinstance(value, BaseModel):
|
|
169
|
-
return value.model_dump()
|
|
170
|
-
elif isinstance(value, dict):
|
|
171
|
-
# Recursively serialize dictionary values
|
|
172
|
-
return {
|
|
173
|
-
k: serialize_value(v, current_depth + 1)
|
|
174
|
-
for k, v in value.items()
|
|
175
|
-
}
|
|
176
|
-
elif isinstance(value, (list, tuple)):
|
|
177
|
-
# Recursively serialize list/tuple items
|
|
178
|
-
return [serialize_value(item, current_depth + 1) for item in value]
|
|
179
|
-
else:
|
|
180
|
-
# Try direct JSON serialization first
|
|
181
|
-
try:
|
|
182
|
-
json.dumps(value)
|
|
183
|
-
return value
|
|
184
|
-
except (TypeError, OverflowError, ValueError):
|
|
185
|
-
# Fallback to safe stringification
|
|
186
|
-
return self.safe_stringify(value, self.function)
|
|
187
|
-
except Exception:
|
|
188
|
-
return {"error": "Unable to serialize"}
|
|
189
|
-
except Exception:
|
|
190
|
-
return {"error": "Unable to serialize"}
|
|
191
|
-
|
|
192
|
-
# Start serialization with the top-level value
|
|
193
|
-
try:
|
|
194
|
-
return serialize_value(value, current_depth=0)
|
|
195
|
-
except Exception:
|
|
196
|
-
return {"error": "Unable to serialize"}
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
class Trace(TraceJudgmentType):
|
|
22
|
+
class TraceScore(OtelSpanDetailScores):
|
|
23
|
+
"""Score information for a trace or span."""
|
|
24
|
+
|
|
200
25
|
pass
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TraceRule(BaseModel):
|
|
29
|
+
"""Rule that was triggered for a trace."""
|
|
30
|
+
|
|
31
|
+
rule_id: str
|
|
32
|
+
rule_name: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TraceSpan(OtelSpanDetail):
|
|
36
|
+
"""Individual span within a trace with complete telemetry data."""
|
|
37
|
+
|
|
38
|
+
@classmethod
|
|
39
|
+
def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
|
|
40
|
+
"""Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
|
|
41
|
+
data = span_detail.model_dump()
|
|
42
|
+
|
|
43
|
+
if "scores" in data and data["scores"]:
|
|
44
|
+
data["scores"] = [TraceScore(**score) for score in data["scores"]]
|
|
45
|
+
|
|
46
|
+
return cls(**data)
|
|
47
|
+
|
|
48
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
49
|
+
"""Convert TraceSpan to dictionary."""
|
|
50
|
+
return self.model_dump(exclude_none=True)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Trace(OtelTraceListItem):
|
|
54
|
+
"""Complete trace with metadata and all associated spans."""
|
|
55
|
+
|
|
56
|
+
spans: List[TraceSpan] = []
|
|
57
|
+
rules: Optional[List[TraceRule]] = []
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
|
|
61
|
+
"""Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
|
|
62
|
+
|
|
63
|
+
if hasattr(dataset_trace, "trace_detail"):
|
|
64
|
+
trace_detail = dataset_trace.trace_detail
|
|
65
|
+
spans_data = dataset_trace.spans
|
|
66
|
+
else:
|
|
67
|
+
trace_detail = dataset_trace.get("trace_detail", {})
|
|
68
|
+
spans_data = dataset_trace.get("spans", [])
|
|
69
|
+
|
|
70
|
+
if hasattr(trace_detail, "model_dump"):
|
|
71
|
+
trace_data = trace_detail.model_dump()
|
|
72
|
+
elif isinstance(trace_detail, dict):
|
|
73
|
+
trace_data = trace_detail.copy()
|
|
74
|
+
else:
|
|
75
|
+
trace_data = dict(trace_detail)
|
|
76
|
+
|
|
77
|
+
spans = []
|
|
78
|
+
for span in spans_data:
|
|
79
|
+
if hasattr(span, "model_dump"):
|
|
80
|
+
spans.append(TraceSpan.from_otel_span_detail(span))
|
|
81
|
+
else:
|
|
82
|
+
# Handle dict spans
|
|
83
|
+
span_data = dict(span) if not isinstance(span, dict) else span.copy()
|
|
84
|
+
if "scores" in span_data and span_data["scores"]:
|
|
85
|
+
span_data["scores"] = [
|
|
86
|
+
TraceScore(**score)
|
|
87
|
+
if isinstance(score, dict)
|
|
88
|
+
else TraceScore(**score.model_dump())
|
|
89
|
+
for score in span_data["scores"]
|
|
90
|
+
]
|
|
91
|
+
spans.append(TraceSpan(**span_data))
|
|
92
|
+
|
|
93
|
+
rules = []
|
|
94
|
+
if "rule_id" in trace_data and trace_data["rule_id"]:
|
|
95
|
+
rules = [
|
|
96
|
+
TraceRule(
|
|
97
|
+
rule_id=trace_data["rule_id"],
|
|
98
|
+
rule_name=f"Rule {trace_data['rule_id']}",
|
|
99
|
+
)
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
trace_data.pop("scores", [])
|
|
103
|
+
trace_data.pop("rule_id", None)
|
|
104
|
+
trace = cls(**trace_data)
|
|
105
|
+
|
|
106
|
+
trace.spans = spans
|
|
107
|
+
trace.rules = rules
|
|
108
|
+
|
|
109
|
+
return trace
|
|
110
|
+
|
|
111
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
112
|
+
"""Convert Trace to dictionary."""
|
|
113
|
+
return self.model_dump(exclude_none=True)
|
|
114
|
+
|
|
115
|
+
def __len__(self) -> int:
|
|
116
|
+
"""Return the number of spans in the trace."""
|
|
117
|
+
return len(self.spans)
|
|
118
|
+
|
|
119
|
+
def __iter__(self):
|
|
120
|
+
"""Iterate over spans in the trace."""
|
|
121
|
+
return iter(self.spans)
|