judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1 @@
1
# Dotted module-style name identifying Judgeval's tracer instrumentation.
# NOTE(review): presumably passed as the instrumenting module name when an
# OpenTelemetry tracer is obtained -- confirm at the call sites.
JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "opentelemetry.instrumentation.judgeval"
@@ -0,0 +1,37 @@
1
+ from opentelemetry.sdk.trace.export import (
2
+ SpanExportResult,
3
+ SpanExporter,
4
+ )
5
+ from opentelemetry.sdk.trace import ReadableSpan
6
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
7
+ from typing import Sequence
8
+
9
+ from judgeval.tracer.exporters.store import ABCSpanStore
10
+ from judgeval.tracer.exporters.s3 import S3Exporter
11
+ from judgeval.tracer.exporters.utils import deduplicate_spans
12
+
13
+
14
class JudgmentSpanExporter(OTLPSpanExporter):
    """OTLP/HTTP span exporter that sends authenticated requests.

    Every request carries a bearer ``Authorization`` header built from
    ``api_key`` and an ``X-Organization-Id`` header. Each batch is passed
    through ``deduplicate_spans`` before the parent exporter sends it.
    """

    def __init__(self, endpoint: str, api_key: str, organization_id: str):
        """Configure the underlying OTLP exporter.

        Args:
            endpoint: URL the spans are posted to.
            api_key: Key placed in the bearer ``Authorization`` header.
            organization_id: Value of the ``X-Organization-Id`` header.
        """
        auth_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": organization_id,
        }
        super().__init__(endpoint=endpoint, headers=auth_headers)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Deduplicate ``spans`` and export the result via the OTLP exporter."""
        unique_spans = deduplicate_spans(spans)
        return super().export(unique_spans)
26
+
27
+
28
class InMemorySpanExporter(SpanExporter):
    """Span exporter that hands spans to a local store instead of sending them."""

    def __init__(self, store: ABCSpanStore):
        """Remember the store that will receive every exported span."""
        self.store = store

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Add all ``spans`` to the store and report success."""
        target = self.store
        target.add(*spans)
        return SpanExportResult.SUCCESS
35
+
36
+
37
+ __all__ = ("JudgmentSpanExporter", "InMemorySpanExporter", "S3Exporter")
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from datetime import datetime
5
+ from typing import Literal, Sequence, Optional, TYPE_CHECKING, cast
6
+ import boto3
7
+ from botocore.client import Config
8
+
9
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
10
+ from opentelemetry.sdk.trace import ReadableSpan
11
+ from judgeval.env import (
12
+ JUDGMENT_S3_ACCESS_KEY_ID,
13
+ JUDGMENT_S3_SECRET_ACCESS_KEY,
14
+ JUDGMENT_S3_REGION_NAME,
15
+ JUDGMENT_S3_BUCKET_NAME,
16
+ JUDGMENT_S3_PREFIX,
17
+ JUDGMENT_S3_ENDPOINT_URL,
18
+ JUDGMENT_S3_SIGNATURE_VERSION,
19
+ JUDGMENT_S3_ADDRESSING_STYLE,
20
+ )
21
+ from judgeval.exceptions import JudgmentRuntimeError
22
+ from judgeval.logger import judgeval_logger
23
+
24
+ if TYPE_CHECKING:
25
+ from mypy_boto3_s3.client import S3Client
26
+
27
+
28
class S3Exporter(SpanExporter):
    """Span exporter that stores every span as its own JSON object in S3.

    Object keys have the form
    ``{prefix}/{trace_id}/{span_id}/{timestamp}.json``: the ids are
    zero-padded lowercase hex and the timestamp is the upload time.
    """

    # ``batch_size`` is assigned in ``__init__``, so it is declared here next
    # to the other instance attributes.
    __slots__ = ("bucket_name", "prefix", "batch_size", "s3_client")

    bucket_name: str
    prefix: str
    batch_size: int
    s3_client: S3Client

    def __init__(
        self,
        bucket_name: Optional[str] = JUDGMENT_S3_BUCKET_NAME,
        region_name: Optional[str] = JUDGMENT_S3_REGION_NAME,
        prefix: str = JUDGMENT_S3_PREFIX,
        s3_access_key_id: Optional[str] = JUDGMENT_S3_ACCESS_KEY_ID,
        s3_secret_access_key: Optional[str] = JUDGMENT_S3_SECRET_ACCESS_KEY,
        endpoint_url: Optional[str] = JUDGMENT_S3_ENDPOINT_URL,
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
        signature_version: str = JUDGMENT_S3_SIGNATURE_VERSION,
        addressing_style: str = JUDGMENT_S3_ADDRESSING_STYLE,
        batch_size: int = 8,
    ):
        """Validate the configuration and create the boto3 S3 client.

        Args:
            bucket_name: Bucket that receives the span objects.
            region_name: Region passed to the boto3 client.
            prefix: Key prefix; trailing slashes are stripped.
            s3_access_key_id: Access key id passed to the boto3 client.
            s3_secret_access_key: Secret access key passed to the boto3 client.
            endpoint_url: Endpoint URL passed to the boto3 client.
            signature_version: Signature version for the botocore ``Config``.
            addressing_style: One of ``"auto"``, ``"virtual"`` or ``"path"``.
            batch_size: Upper bound on concurrent upload threads per export.

        Raises:
            JudgmentRuntimeError: If the bucket or region is missing, or the
                addressing style is not one of the accepted values.
        """
        if not bucket_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_BUCKET_NAME is not set")

        if not region_name:
            raise JudgmentRuntimeError("JUDGMENT_S3_REGION_NAME is not set")

        if addressing_style not in ("auto", "virtual", "path"):
            raise JudgmentRuntimeError(f"Invalid addressing style: {addressing_style}")
        addressing_style = cast(Literal["auto", "virtual", "path"], addressing_style)

        self.bucket_name = bucket_name
        self.prefix = prefix.rstrip("/")
        self.batch_size = batch_size

        self.s3_client = boto3.client(
            "s3",
            config=Config(
                signature_version=signature_version,
                s3={"addressing_style": addressing_style},
            ),
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=endpoint_url,
            region_name=region_name,
        )

    def _upload_span(self, span: ReadableSpan) -> tuple[bool, str]:
        """Upload a single span to S3. Returns (success, key).

        Never raises: any error is logged and reported as ``(False, "")``.
        """
        # Bound before the ``try`` so the error handler below can always read
        # it, even when ``get_span_context()`` itself is what failed.
        span_context = None
        try:
            span_context = span.get_span_context()
            if not span_context:
                return False, ""

            # Naive local wall-clock time with microseconds.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
            trace_id = format(span_context.trace_id, "032x")
            span_id = format(span_context.span_id, "016x")
            key = f"{self.prefix}/{trace_id}/{span_id}/{timestamp}.json"

            span_json = span.to_json(indent=0)

            self.s3_client.put_object(
                Bucket=self.bucket_name,
                Key=key,
                Body=span_json,
                ContentType="application/json",
            )
            return True, key
        except Exception as e:
            judgeval_logger.error(
                f"Error uploading span {span_context.span_id if span_context else 'unknown'}: {e}"
            )
            return False, ""

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Upload ``spans`` concurrently, one S3 object per span.

        Returns:
            ``SUCCESS`` for an empty batch or when every upload succeeded;
            ``FAILURE`` as soon as one failed upload is observed, or when the
            export itself raises (the error is logged).
        """
        if not spans:
            return SpanExportResult.SUCCESS

        try:
            with ThreadPoolExecutor(
                max_workers=min(len(spans), self.batch_size)
            ) as executor:
                futures = [executor.submit(self._upload_span, span) for span in spans]

                for future in as_completed(futures):
                    success, _key = future.result()
                    if not success:
                        # Leaving the ``with`` block still waits for the
                        # uploads that are already queued or running.
                        return SpanExportResult.FAILURE
            return SpanExportResult.SUCCESS

        except Exception as e:
            judgeval_logger.error(f"Error exporting spans to S3: {e}")
            return SpanExportResult.FAILURE
@@ -0,0 +1,43 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List
3
+
4
+ from opentelemetry.sdk.trace import ReadableSpan
5
+
6
+
7
class ABCSpanStore(ABC):
    """Interface for containers that hold spans."""

    @abstractmethod
    def add(self, *spans: ReadableSpan):
        """Store the given spans."""
        ...

    @abstractmethod
    def get(self, id: "int | str") -> ReadableSpan:
        """Return the stored span identified by ``id``."""
        ...

    @abstractmethod
    def get_all(self) -> List[ReadableSpan]:
        """Return every stored span."""
        ...


class SpanStore(ABCSpanStore):
    """List-backed span store that keeps spans in insertion order."""

    __slots__ = ("spans",)

    spans: List[ReadableSpan]

    def __init__(self):
        self.spans = []

    def add(self, *spans: ReadableSpan):
        """Append ``spans`` to the store."""
        self.spans.extend(spans)

    def get(self, id: "int | str") -> ReadableSpan:
        """Return the first stored span whose span id matches ``id``.

        Args:
            id: The span id, either as the integer found on the span context
                or as its hexadecimal string form (e.g. ``"00000000000000ff"``).

        Raises:
            ValueError: If no stored span has that id, or ``id`` is a string
                that is not valid hexadecimal.
        """
        if isinstance(id, str):
            # Span contexts carry integer ids, so a string can never compare
            # equal to one directly; interpret it as hex first.
            try:
                wanted = int(id, 16)
            except ValueError:
                raise ValueError(f"Span with id {id} not found") from None
        else:
            wanted = id

        for span in self.spans:
            context = span.get_span_context()
            if context is None:
                continue
            if context.span_id == wanted:
                return span

        raise ValueError(f"Span with id {id} not found")

    def get_all(self) -> List[ReadableSpan]:
        """Return the underlying list of spans (not a copy)."""
        return self.spans

    def __repr__(self) -> str:
        return f"SpanStore(spans={self.spans})"
@@ -0,0 +1,32 @@
1
+ from typing import Sequence
2
+ from opentelemetry.sdk.trace import ReadableSpan
3
+
4
+ from judgeval.tracer.keys import AttributeKeys
5
+
6
+
7
def deduplicate_spans(spans: Sequence[ReadableSpan]) -> Sequence[ReadableSpan]:
    """Keep one span per ``(trace_id, span_id)``: the one with the highest update id.

    The update id is read from the ``AttributeKeys.JUDGMENT_UPDATE_ID``
    attribute. On a tie the span seen first is kept. The result preserves
    the order in which each ``(trace_id, span_id)`` pair was first seen.

    NOTE(review): spans with no attributes, no context, or no integer update
    id are left out of the result entirely -- confirm this filtering is
    intended for spans that were not produced by Judgeval.
    """
    latest: dict[tuple[int, int], ReadableSpan] = {}

    for candidate in spans:
        if not (candidate.attributes and candidate.context):
            continue

        candidate_update = candidate.attributes.get(AttributeKeys.JUDGMENT_UPDATE_ID)
        if not isinstance(candidate_update, int):
            continue

        identity = (candidate.context.trace_id, candidate.context.span_id)
        if identity not in latest:
            latest[identity] = candidate
            continue

        incumbent_attrs = latest[identity].attributes
        if incumbent_attrs:
            incumbent_update = incumbent_attrs.get(AttributeKeys.JUDGMENT_UPDATE_ID, 0)
        else:
            incumbent_update = 0

        if not isinstance(incumbent_update, (int, float)):
            continue
        if candidate_update > incumbent_update:
            # Overwriting an existing key keeps its original position.
            latest[identity] = candidate

    return list(latest.values())
@@ -0,0 +1,67 @@
1
+ """
2
+ Identifiers used by Judgeval to store specific types of data in the spans.
3
+ """
4
+
5
+ from opentelemetry.semconv.resource import ResourceAttributes
6
+ from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
7
+ from enum import Enum
8
+
9
+
10
class AttributeKeys(str, Enum):
    """Span attribute names used by Judgeval.

    Members are strings. Keys under ``judgment.*`` are Judgeval's own; the
    ``GEN_AI_*`` members reuse the OpenTelemetry GenAI attribute constants,
    except ``GEN_AI_USAGE_TOTAL_COST`` which is defined here.
    """

    # --- General function tracing (custom namespace) ---
    JUDGMENT_SPAN_KIND = "judgment.span_kind"
    JUDGMENT_INPUT = "judgment.input"
    JUDGMENT_OUTPUT = "judgment.output"
    JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
    JUDGMENT_UPDATE_ID = "judgment.update_id"

    # --- Custom tracking ---
    JUDGMENT_CUSTOMER_ID = "judgment.customer_id"

    # --- Agent-specific (custom namespace) ---
    JUDGMENT_AGENT_ID = "judgment.agent_id"
    JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
    JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
    JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
    JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
    JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
    JUDGMENT_STATE_BEFORE = "judgment.state_before"
    JUDGMENT_STATE_AFTER = "judgment.state_after"

    # --- Evaluation-specific (custom namespace) ---
    PENDING_TRACE_EVAL = "judgment.pending_trace_eval"

    # --- GenAI (OpenTelemetry semantic conventions) ---
    GEN_AI_PROMPT = gen_ai_attributes.GEN_AI_PROMPT
    GEN_AI_COMPLETION = gen_ai_attributes.GEN_AI_COMPLETION
    GEN_AI_REQUEST_MODEL = gen_ai_attributes.GEN_AI_REQUEST_MODEL
    GEN_AI_RESPONSE_MODEL = gen_ai_attributes.GEN_AI_RESPONSE_MODEL
    GEN_AI_SYSTEM = gen_ai_attributes.GEN_AI_SYSTEM
    GEN_AI_USAGE_INPUT_TOKENS = gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS
    GEN_AI_USAGE_OUTPUT_TOKENS = gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS
    GEN_AI_USAGE_COMPLETION_TOKENS = gen_ai_attributes.GEN_AI_USAGE_COMPLETION_TOKENS
    GEN_AI_REQUEST_TEMPERATURE = gen_ai_attributes.GEN_AI_REQUEST_TEMPERATURE
    GEN_AI_REQUEST_MAX_TOKENS = gen_ai_attributes.GEN_AI_REQUEST_MAX_TOKENS
    GEN_AI_RESPONSE_FINISH_REASONS = gen_ai_attributes.GEN_AI_RESPONSE_FINISH_REASONS

    # --- GenAI (custom namespace) ---
    GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost_usd"
49
+
50
+
51
class InternalAttributeKeys(str, Enum):
    """
    Keys for temporary, internal state kept by span processors.

    They are NOT exported; they exist only to manage the lifecycle of a span
    inside the process.
    """

    # Span control flags
    DISABLE_PARTIAL_EMIT = "disable_partial_emit"
    CANCELLED = "cancelled"
61
+
62
class ResourceKeys(str, Enum):
    """Resource attribute names: OpenTelemetry's standard keys plus Judgeval's project id."""

    # Reused from the OpenTelemetry resource semantic conventions
    SERVICE_NAME = ResourceAttributes.SERVICE_NAME
    TELEMETRY_SDK_LANGUAGE = ResourceAttributes.TELEMETRY_SDK_LANGUAGE
    TELEMETRY_SDK_NAME = ResourceAttributes.TELEMETRY_SDK_NAME
    TELEMETRY_SDK_VERSION = ResourceAttributes.TELEMETRY_SDK_VERSION

    # Judgeval-specific (custom namespace)
    JUDGMENT_PROJECT_ID = "judgment.project_id"