judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Callable, Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
6
|
+
from opentelemetry.trace import Tracer, NoOpTracer
|
|
7
|
+
from opentelemetry.util.types import Attributes
|
|
8
|
+
|
|
9
|
+
from judgeval.logger import judgeval_logger
|
|
10
|
+
from judgeval.v1.tracer.base_tracer import BaseTracer
|
|
11
|
+
|
|
12
|
+
FilterTracerCallback = Callable[[str, Optional[str], Optional[str], Attributes], bool]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class JudgmentTracerProvider(TracerProvider):
    """TracerProvider that can hand out no-op tracers for filtered libraries.

    A ``filter_tracer`` callback is consulted for every tracer request except
    requests for ``BaseTracer.TRACER_NAME``, which are always served by the
    real provider. When the callback returns a falsy value a ``NoOpTracer``
    is returned instead of a real tracer.
    """

    __slots__ = ("_filter_tracer",)

    def __init__(
        self,
        filter_tracer: Optional[FilterTracerCallback] = None,
        **kwargs,
    ):
        """Create the provider.

        Args:
            filter_tracer: Callback receiving ``(instrumenting_module_name,
                instrumenting_library_version, schema_url, attributes)`` and
                returning whether a real tracer should be created. When
                omitted, every tracer is allowed.
            **kwargs: Forwarded unchanged to ``TracerProvider.__init__``.
        """
        super().__init__(**kwargs)
        self._filter_tracer = (
            filter_tracer if filter_tracer is not None else lambda *_: True
        )

    def get_tracer(
        self,
        instrumenting_module_name: str,
        instrumenting_library_version: Optional[str] = None,
        schema_url: Optional[str] = None,
        attributes: Attributes = None,
    ) -> Tracer:
        """Return a tracer for the given instrumentation scope.

        Returns:
            A real tracer from the parent provider, or a ``NoOpTracer`` when
            the filter callback rejects the scope.
        """
        # The SDK's own tracer bypasses the filter entirely.
        if instrumenting_module_name != BaseTracer.TRACER_NAME:
            # Only the user-supplied callback is guarded: an error raised by
            # the parent provider must not be reported as a filter failure.
            try:
                allowed = self._filter_tracer(
                    instrumenting_module_name,
                    instrumenting_library_version,
                    schema_url,
                    attributes,
                )
            except Exception as error:
                judgeval_logger.error(
                    f"[JudgmentTracerProvider] Failed to filter tracer {instrumenting_module_name}: {error}."
                )
                # Fail open: a broken filter must not disable tracing.
                allowed = True

            if not allowed:
                judgeval_logger.debug(
                    f"[JudgmentTracerProvider] Returning NoOpTracer for tracer {instrumenting_module_name} as it is disallowed by the filterTracer callback."
                )
                return NoOpTracer()

        return super().get_tracer(
            instrumenting_module_name,
            instrumenting_library_version,
            schema_url,
            attributes,
        )
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from judgeval.v1.tracer.processors.judgment_span_processor import JudgmentSpanProcessor
|
|
4
|
+
from judgeval.v1.tracer.processors.noop_span_processor import NoOpJudgmentSpanProcessor
|
|
5
|
+
|
|
6
|
+
__all__ = ["JudgmentSpanProcessor", "NoOpJudgmentSpanProcessor"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from judgeval.v1.tracer.processors._lifecycles.customer_id_processor import (
|
|
4
|
+
CustomerIdProcessor,
|
|
5
|
+
)
|
|
6
|
+
from judgeval.v1.tracer.processors._lifecycles.agent_id_processor import (
|
|
7
|
+
AgentIdProcessor,
|
|
8
|
+
)
|
|
9
|
+
from judgeval.v1.tracer.processors._lifecycles.registry import get_all, register
|
|
10
|
+
from judgeval.v1.tracer.processors._lifecycles.context_keys import (
|
|
11
|
+
CUSTOMER_ID_KEY,
|
|
12
|
+
AGENT_ID_KEY,
|
|
13
|
+
PARENT_AGENT_ID_KEY,
|
|
14
|
+
AGENT_CLASS_NAME_KEY,
|
|
15
|
+
AGENT_INSTANCE_NAME_KEY,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"CustomerIdProcessor",
|
|
20
|
+
"AgentIdProcessor",
|
|
21
|
+
"get_all",
|
|
22
|
+
"register",
|
|
23
|
+
"CUSTOMER_ID_KEY",
|
|
24
|
+
"AGENT_ID_KEY",
|
|
25
|
+
"PARENT_AGENT_ID_KEY",
|
|
26
|
+
"AGENT_CLASS_NAME_KEY",
|
|
27
|
+
"AGENT_INSTANCE_NAME_KEY",
|
|
28
|
+
]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry.context import Context, get_value
|
|
6
|
+
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
|
|
7
|
+
|
|
8
|
+
from judgeval.v1.tracer.processors._lifecycles.registry import register
|
|
9
|
+
from judgeval.v1.tracer.processors._lifecycles.context_keys import (
|
|
10
|
+
AGENT_ID_KEY,
|
|
11
|
+
PARENT_AGENT_ID_KEY,
|
|
12
|
+
AGENT_CLASS_NAME_KEY,
|
|
13
|
+
AGENT_INSTANCE_NAME_KEY,
|
|
14
|
+
)
|
|
15
|
+
from judgeval.judgment_attribute_keys import AttributeKeys
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AgentIdProcessor(SpanProcessor):
    """Span processor that stamps agent identity values onto starting spans.

    Values are read from the parent context under the agent context keys and
    written to the span as string attributes; nothing happens on span end.
    """

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Copy agent-related context values onto ``span`` as attributes."""

        def _copy(context_key: str, attribute_key: str):
            # Read one context value and mirror it to the span when present;
            # the raw value is returned for the entry-point check below.
            value = get_value(context_key, context=parent_context)
            if value is not None:
                span.set_attribute(attribute_key, str(value))
            return value

        agent_id = _copy(AGENT_ID_KEY, AttributeKeys.JUDGMENT_AGENT_ID)
        parent_agent_id = _copy(
            PARENT_AGENT_ID_KEY, AttributeKeys.JUDGMENT_PARENT_AGENT_ID
        )
        _copy(AGENT_CLASS_NAME_KEY, AttributeKeys.JUDGMENT_AGENT_CLASS_NAME)
        _copy(AGENT_INSTANCE_NAME_KEY, AttributeKeys.JUDGMENT_AGENT_INSTANCE_NAME)

        if agent_id is None:
            return
        # A span whose agent differs from its parent agent is flagged as the
        # entry point of that agent.
        if agent_id != parent_agent_id:
            span.set_attribute(AttributeKeys.JUDGMENT_IS_AGENT_ENTRY_POINT, True)

    def on_end(self, span: ReadableSpan) -> None:
        """No work is needed when a span ends."""
        return None

    def shutdown(self) -> None:
        """Nothing to release."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Nothing is buffered, so flushing always reports success."""
        return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
register(AgentIdProcessor)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from opentelemetry.context import create_key
|
|
4
|
+
from judgeval.judgment_attribute_keys import AttributeKeys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
CUSTOMER_ID_KEY = create_key(AttributeKeys.JUDGMENT_CUSTOMER_ID)
|
|
8
|
+
AGENT_ID_KEY = create_key(AttributeKeys.JUDGMENT_AGENT_ID)
|
|
9
|
+
PARENT_AGENT_ID_KEY = create_key(AttributeKeys.JUDGMENT_PARENT_AGENT_ID)
|
|
10
|
+
AGENT_CLASS_NAME_KEY = create_key(AttributeKeys.JUDGMENT_AGENT_CLASS_NAME)
|
|
11
|
+
AGENT_INSTANCE_NAME_KEY = create_key(AttributeKeys.JUDGMENT_AGENT_INSTANCE_NAME)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry.context import Context, get_value
|
|
6
|
+
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
|
|
7
|
+
|
|
8
|
+
from judgeval.v1.tracer.processors._lifecycles.registry import register
|
|
9
|
+
from judgeval.v1.tracer.processors._lifecycles.context_keys import CUSTOMER_ID_KEY
|
|
10
|
+
from judgeval.judgment_attribute_keys import AttributeKeys
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CustomerIdProcessor(SpanProcessor):
    """Span processor that copies the customer id from context onto spans."""

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Set the customer-id attribute when the parent context carries one."""
        value = get_value(CUSTOMER_ID_KEY, context=parent_context)
        if value is None:
            return
        # The context value is stored on the span as a string.
        span.set_attribute(AttributeKeys.JUDGMENT_CUSTOMER_ID, str(value))

    def on_end(self, span: ReadableSpan) -> None:
        """No work is needed when a span ends."""
        return None

    def shutdown(self) -> None:
        """Nothing to release."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Nothing is buffered, so flushing always reports success."""
        return True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
register(CustomerIdProcessor)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Callable, List
|
|
4
|
+
|
|
5
|
+
from opentelemetry.sdk.trace import SpanProcessor
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ProcessorFactory = Callable[[], SpanProcessor]
|
|
9
|
+
|
|
10
|
+
_lifecycle_processors: List[ProcessorFactory] = []
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def register(processor_class: ProcessorFactory) -> None:
    """Append a zero-argument processor factory to the lifecycle registry.

    The factory is stored as-is; it is only called later by ``get_all``.
    """
    registry = _lifecycle_processors
    registry.append(processor_class)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_all() -> List[SpanProcessor]:
    """Build one fresh processor per registered factory, in registration order."""
    processors: List[SpanProcessor] = []
    for make_processor in _lifecycle_processors:
        processors.append(make_processor())
    return processors
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
4
|
+
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
from opentelemetry.context import Context
|
|
8
|
+
from opentelemetry.sdk.trace import ReadableSpan, Span
|
|
9
|
+
from opentelemetry.trace import get_current_span
|
|
10
|
+
from opentelemetry.trace.span import SpanContext
|
|
11
|
+
from opentelemetry.sdk.trace.export import (
|
|
12
|
+
BatchSpanProcessor,
|
|
13
|
+
SpanExporter,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from judgeval.judgment_attribute_keys import AttributeKeys
|
|
17
|
+
from judgeval.tracer.keys import InternalAttributeKeys
|
|
18
|
+
from judgeval.utils.decorators.dont_throw import dont_throw
|
|
19
|
+
from judgeval.v1.tracer.processors._lifecycles import get_all
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from judgeval.v1.tracer import BaseTracer
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JudgmentSpanProcessor(BatchSpanProcessor):
    """Batch span processor with per-span internal state and partial emits.

    In addition to the batching inherited from ``BatchSpanProcessor`` it:

    * runs every registered lifecycle processor on span start and end,
    * keeps a private attribute store keyed by ``(trace_id, span_id)`` that
      is never exported directly,
    * can export a snapshot of the currently active span (``emit_partial``),
    * tags every exported span with an update id.
    """

    __slots__ = ("tracer", "resource_attributes", "_internal_attributes")

    def __init__(
        self,
        tracer: BaseTracer,
        exporter: SpanExporter,
        /,
        *,
        max_queue_size: int | None = None,
        schedule_delay_millis: float | None = None,
        max_export_batch_size: int | None = None,
        export_timeout_millis: float | None = None,
    ):
        """Create the processor.

        Args:
            tracer: Owning tracer, stored on ``self.tracer``.
            exporter: Span exporter handed to ``BatchSpanProcessor``.
            max_queue_size, schedule_delay_millis, max_export_batch_size,
            export_timeout_millis: Forwarded unchanged to
                ``BatchSpanProcessor.__init__``.
        """
        self.tracer = tracer

        super().__init__(
            exporter,
            max_queue_size=max_queue_size,
            schedule_delay_millis=schedule_delay_millis,
            max_export_batch_size=max_export_batch_size,
            export_timeout_millis=export_timeout_millis,
        )
        # Internal (non-exported) attributes, keyed by (trace_id, span_id).
        self._internal_attributes: defaultdict[tuple[int, int], dict[str, Any]] = (
            defaultdict(dict)
        )

    def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
        """Return the ``(trace_id, span_id)`` key identifying a span."""
        return (span_context.trace_id, span_context.span_id)

    def set_internal_attribute(
        self, span_context: SpanContext, key: str, value: Any
    ) -> None:
        """Store ``value`` under ``key`` in the span's internal state."""
        span_key = self._get_span_key(span_context)
        self._internal_attributes[span_key][key] = value

    def get_internal_attribute(
        self, span_context: SpanContext, key: str, default: Any = None
    ) -> Any:
        """Return the span's internal attribute ``key``, or ``default``.

        The lookup never creates state: indexing the ``defaultdict`` directly
        would insert an empty dict for every span that is merely queried,
        and those entries would linger for spans that are never cleaned up.
        """
        span_attributes = self._internal_attributes.get(
            self._get_span_key(span_context)
        )
        if span_attributes is None:
            return default
        return span_attributes.get(key, default)

    def increment_update_id(self, span_context: SpanContext) -> int:
        """Advance the span's update counter and return its previous value.

        The counter starts at 0, so the first call returns 0 and stores 1.
        """
        current_id = self.get_internal_attribute(
            span_context=span_context, key=AttributeKeys.JUDGMENT_UPDATE_ID, default=0
        )
        new_id = current_id + 1
        self.set_internal_attribute(
            span_context=span_context,
            key=AttributeKeys.JUDGMENT_UPDATE_ID,
            value=new_id,
        )
        return current_id

    def _cleanup_span_state(self, span_key: tuple[int, int]) -> None:
        """Drop all internal state held for ``span_key`` (no-op if absent)."""
        self._internal_attributes.pop(span_key, None)

    # NOTE(review): dont_throw presumably suppresses exceptions raised by the
    # decorated method — confirm in judgeval.utils.decorators.dont_throw.
    @dont_throw
    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Run every registered lifecycle processor's ``on_start``."""
        for processor in get_all():
            processor.on_start(span, parent_context)

    @dont_throw
    def emit_partial(self) -> None:
        """Export a snapshot of the current span before it has ended.

        Does nothing when there is no recording, readable current span or
        when partial emits are disabled for it through the internal
        ``DISABLE_PARTIAL_EMIT`` attribute.
        """
        current_span = get_current_span()
        if (
            not current_span
            or not current_span.is_recording()
            or not isinstance(current_span, ReadableSpan)
        ):
            return

        span_context = current_span.get_span_context()
        if self.get_internal_attribute(
            span_context, InternalAttributeKeys.DISABLE_PARTIAL_EMIT, False
        ):
            return

        # Copy the attributes so the live span is not mutated by the tag.
        attributes = dict(current_span.attributes or {})
        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = self.increment_update_id(
            span_context
        )

        partial_span = ReadableSpan(
            name=current_span.name,
            context=span_context,
            parent=current_span.parent,
            resource=current_span.resource,
            attributes=attributes,
            events=current_span.events,
            links=current_span.links,
            status=current_span.status,
            kind=current_span.kind,
            start_time=current_span.start_time,
            end_time=None,  # the span is still open
            instrumentation_scope=current_span.instrumentation_scope,
        )

        super().on_end(partial_span)

    @dont_throw
    def on_end(self, span: ReadableSpan) -> None:
        """Finalize a span: run lifecycle hooks, then export or drop it.

        Spans flagged with the internal ``CANCELLED`` attribute are dropped.
        Ended spans are re-wrapped with a fixed update id and exported;
        spans without an end time or without a context are forwarded as-is.
        """
        for processor in get_all():
            processor.on_end(span)

        if not span.context:
            super().on_end(span)
            return

        span_key = self._get_span_key(span.context)

        if self.get_internal_attribute(
            span.context, InternalAttributeKeys.CANCELLED, False
        ):
            # Cancelled spans are never exported; only their state is freed.
            self._cleanup_span_state(span_key)
            return

        if span.end_time is not None:
            attributes = dict(span.attributes or {})
            # NOTE(review): 20 is the fixed update id for the final span,
            # presumably meant to rank after partial-emit ids — confirm.
            attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20

            final_span = ReadableSpan(
                name=span.name,
                context=span.context,
                parent=span.parent,
                resource=span.resource,
                attributes=attributes,
                events=span.events,
                links=span.links,
                status=span.status,
                kind=span.kind,
                start_time=span.start_time,
                end_time=span.end_time,
                instrumentation_scope=span.instrumentation_scope,
            )

            self._cleanup_span_state(span_key)
            super().on_end(final_span)
        else:
            super().on_end(span)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from opentelemetry.sdk.trace import Span, ReadableSpan
|
|
2
|
+
from opentelemetry.context import Context
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry.trace import SpanContext
|
|
6
|
+
|
|
7
|
+
from judgeval.v1.tracer.processors.judgment_span_processor import JudgmentSpanProcessor
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
    """Drop-in ``JudgmentSpanProcessor`` whose every operation does nothing.

    ``__init__`` does not call the parent constructor, so none of the parent
    state is created; presumably this avoids starting the batch export
    machinery — confirm against ``BatchSpanProcessor``.
    """

    __slots__ = ("resource_attributes",)

    def __init__(self):
        self.resource_attributes = {}

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Ignore span start."""
        return None

    def on_end(self, span: ReadableSpan) -> None:
        """Ignore span end; nothing is exported."""
        return None

    def shutdown(self) -> None:
        """Nothing to shut down."""
        return None

    def force_flush(self, timeout_millis: int | None = 30000) -> bool:
        """Nothing is buffered, so flushing always reports success."""
        return True

    def emit_partial(self) -> None:
        """Partial emits are disabled."""
        return None

    def set_internal_attribute(
        self, span_context: SpanContext, key: str, value: Any
    ) -> None:
        """Discard the value; no internal state is kept."""
        return None

    def get_internal_attribute(
        self, span_context: SpanContext, key: str, default: Any = None
    ) -> Any:
        """Always return ``default`` since nothing is ever stored."""
        return default

    def increment_update_id(self, span_context: SpanContext) -> int:
        """Always return 0; no counter is maintained."""
        return 0
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry import trace
|
|
6
|
+
from opentelemetry.sdk.resources import Resource
|
|
7
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
8
|
+
|
|
9
|
+
from judgeval.v1.internal.api import JudgmentSyncClient
|
|
10
|
+
from judgeval.logger import judgeval_logger
|
|
11
|
+
from judgeval.v1.tracer.judgment_tracer_provider import JudgmentTracerProvider
|
|
12
|
+
from judgeval.version import get_version
|
|
13
|
+
from judgeval.v1.tracer.base_tracer import BaseTracer
|
|
14
|
+
from judgeval.v1.tracer.judgment_tracer_provider import FilterTracerCallback
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Tracer(BaseTracer):
    """Concrete tracer that owns a ``JudgmentTracerProvider``.

    The provider is created in ``initialize`` and installed as the global
    OpenTelemetry tracer provider; until then flush and shutdown only log
    an error.
    """

    __slots__ = ("_tracer_provider", "_filter_tracer")

    def __init__(
        self,
        project_name: str,
        enable_evaluation: bool,
        api_client: JudgmentSyncClient,
        serializer: Callable[[Any], str],
        initialize: bool,
        filter_tracer: Optional[FilterTracerCallback] = None,
    ):
        """Set up the tracer and, when ``initialize`` is true, its provider.

        ``project_name``, ``enable_evaluation``, ``api_client`` and
        ``serializer`` are passed to ``BaseTracer``; ``filter_tracer`` is
        kept for the provider created by ``initialize``.
        """
        super().__init__(
            project_name=project_name,
            enable_evaluation=enable_evaluation,
            api_client=api_client,
            serializer=serializer,
        )
        self._tracer_provider: Optional[TracerProvider] = None
        self._filter_tracer = filter_tracer

        if not initialize:
            return
        self.initialize()

    def initialize(self) -> None:
        """Create the tracer provider and register it globally."""
        resource_attributes = {
            "service.name": self.project_name,
            "telemetry.sdk.name": self.TRACER_NAME,
            "telemetry.sdk.version": get_version(),
        }
        resource = Resource.create(resource_attributes)

        provider = JudgmentTracerProvider(
            resource=resource, filter_tracer=self._filter_tracer
        )
        # Publish the provider on self before building the span processor,
        # matching the original statement order.
        self._tracer_provider = provider
        provider.add_span_processor(self.get_span_processor())

        trace.set_tracer_provider(provider)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Flush the provider; return ``False`` when it was never created."""
        provider = self._tracer_provider
        if provider is not None:
            return provider.force_flush(timeout_millis)
        judgeval_logger.error("Cannot forceFlush: tracer not initialized")
        return False

    def shutdown(self, timeout_millis: int = 30000) -> None:
        """Shut the provider down.

        ``timeout_millis`` is accepted but not forwarded to the provider.
        """
        provider = self._tracer_provider
        if provider is not None:
            provider.shutdown()
            return
        judgeval_logger.error("Cannot shutdown: tracer not initialized")
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.utils.serialize import safe_serialize
|
|
6
|
+
from judgeval.v1.internal.api import JudgmentSyncClient
|
|
7
|
+
from judgeval.v1.tracer.judgment_tracer_provider import FilterTracerCallback
|
|
8
|
+
from judgeval.v1.tracer.tracer import Tracer
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TracerFactory:
    """Builds ``Tracer`` instances that share one API client."""

    # A tuple, like every other ``__slots__`` in this package; a bare string
    # happens to work for a single slot but is easy to break when extended.
    __slots__ = ("_client",)

    def __init__(
        self,
        client: JudgmentSyncClient,
    ):
        """Remember the API client handed to every created tracer."""
        self._client = client

    def create(
        self,
        project_name: str,
        enable_evaluation: bool = True,
        serializer: Optional[Callable[[Any], str]] = None,
        filter_tracer: Optional[FilterTracerCallback] = None,
        initialize: bool = True,
    ) -> Tracer:
        """Create a ``Tracer`` for ``project_name``.

        Args:
            project_name: Project the tracer reports under.
            enable_evaluation: Passed through to ``Tracer``.
            serializer: Callable turning a value into a string; defaults to
                ``safe_serialize`` when omitted.
            filter_tracer: Optional tracer filter callback passed to ``Tracer``.
            initialize: Passed through to ``Tracer``.

        Returns:
            The newly constructed ``Tracer``.
        """
        if serializer is None:
            serializer = safe_serialize

        return Tracer(
            project_name=project_name,
            enable_evaluation=enable_evaluation,
            api_client=self._client,
            serializer=serializer,
            initialize=initialize,
            filter_tracer=filter_tracer,
        )
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from judgeval.v1.tracer.tracer import Tracer
|
|
8
|
+
from judgeval.v1.trainers.trainable_model import TrainableModel
|
|
9
|
+
from judgeval.v1.trainers.config import TrainerConfig, ModelConfig
|
|
10
|
+
from judgeval.v1.scorers.base_scorer import BaseScorer
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseTrainer(ABC):
    """Abstract base holding the state shared by trainer implementations.

    Stores the trainer config, the trainable model, the tracer and the
    project name; all training behavior is left to subclasses.
    """

    __slots__ = ("config", "trainable_model", "tracer", "project_name")

    def __init__(
        self,
        config: TrainerConfig,
        trainable_model: TrainableModel,
        tracer: Tracer,
        project_name: Optional[str] = None,
    ):
        """Store the collaborators; a falsy ``project_name`` falls back to
        ``"judgment_training"``."""
        self.config = config
        self.trainable_model = trainable_model
        self.tracer = tracer
        self.project_name = project_name if project_name else "judgment_training"

    @abstractmethod
    async def generate_rollouts_and_rewards(
        self,
        agent_function: Callable[[Any], Any],
        scorers: List[BaseScorer],
        prompts: dict[int, dict[Any, Any]],
        num_prompts_per_step: Optional[int] = None,
        num_generations_per_prompt: Optional[int] = None,
        concurrency: Optional[int] = None,
    ) -> Any:
        """Abstract coroutine; subclasses must implement it."""

    @abstractmethod
    async def run_reinforcement_learning(
        self,
        agent_function: Callable[[Any], Any],
        scorers: List[BaseScorer],
        prompts: dict[int, dict[Any, Any]],
    ) -> "ModelConfig":
        """Abstract coroutine; subclasses must implement it."""

    @abstractmethod
    async def train(
        self,
        agent_function: Callable[[Any], Any],
        scorers: List[BaseScorer],
        prompts: dict[int, dict[Any, Any]],
    ) -> "ModelConfig":
        """Abstract coroutine; subclasses must implement it."""

    @abstractmethod
    def _extract_message_history_from_spans(
        self, trace_id: str
    ) -> List[Dict[str, str]]:
        """Abstract method; subclasses must implement it."""
|