PyPI - agentevals-cli - Versions diffs - 0.9.0__tar.gz → 0.9.1__tar.gz - Mend

agentevals-cli 0.9.0.tar.gz → 0.9.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (278) hide show

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agentevals-cli
-Version: 0.9.0
+Version: 0.9.1
 Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
 License-File: LICENSE
 Requires-Python: >=3.11

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/custom-evaluators.md RENAMED Viewed

@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
 | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
 | `rouge_l` | Longest common subsequence overlap (F-measure) |
+### Label Model Grader
+Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
+```yaml
+evaluators:
+  - name: quality_check
+    type: openai_eval
+    grader:
+      type: label_model
+      model: gpt-4o-mini
+      input:
+        - role: user
+          content: "Rate this response: {{ item.actual_response }}"
+      labels: [good, bad]
+      passing_labels: [good]
+```
+The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.
 ### How it works
 Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/eval_config.yaml RENAMED Viewed

@@ -32,3 +32,4 @@ evaluators:
     ref: evaluators/random_evaluator/random_evaluator.py
     threshold: 0.110
     executor: local

agentevals_cli-0.9.1/examples/custom_evaluators/eval_config_openai_eval.yaml ADDED Viewed

@@ -0,0 +1,18 @@
+# Eval config using OpenAI Evals API graders.
+# Requires OPENAI_API_KEY to be set.
+#
+# Run with:
+#   agentevals run samples/helm.json \
+#     --config examples/custom_evaluators/eval_config_openai_eval.yaml
+evaluators:
+  - name: quality_check
+    type: openai_eval
+    grader:
+      type: label_model
+      model: gpt-4o-mini
+      input:
+        - role: user
+          content: "Rate this response: {{ item.actual_response }}"
+      labels: [good, bad]
+      passing_labels: [good]

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "agentevals-cli"
-version = "0.9.0"
+version = "0.9.1"
 description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
 readme = "README.md"
 requires-python = ">=3.11"

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/config.py RENAMED Viewed

@@ -100,13 +100,21 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "label_model":
+            for field in ("model", "input", "labels", "passing_labels"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for label_model grader")
+            invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
+            if invalid:
+                raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
+        else:
+            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
         return v

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/converter.py RENAMED Viewed

@@ -23,6 +23,7 @@ from .extraction import (
     extract_tool_call_from_span,
     extract_tool_result_from_span,
     extract_user_text_from_attrs,
+    find_adk_llm_spans_in,
     get_extractor,
     has_adk_descendant,
     is_adk_scope,
@@ -127,15 +128,18 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
 def _convert_invoke_span(invoke_span: Span) -> Invocation:
-    call_llm_spans = _find_children_by_op(invoke_span, "call_llm")
-    if not call_llm_spans:
-        raise ValueError(f"invoke_agent span {invoke_span.span_id} has no child call_llm spans")
+    llm_spans = find_adk_llm_spans_in(invoke_span)
+    if not llm_spans:
+        raise ValueError(
+            f"invoke_agent span {invoke_span.span_id} has no converter-compatible ADK LLM descendants; "
+            "expected call_llm or ADK generate_content spans"
+        )
     tool_spans = _find_children_by_op(invoke_span, "execute_tool")
-    user_content = _extract_user_content(call_llm_spans[0])
-    final_response = _extract_final_response(call_llm_spans[-1])
-    tool_uses, tool_responses = _extract_tool_trajectory(call_llm_spans, tool_spans)
+    user_content = _extract_user_content(llm_spans[0])
+    final_response = _extract_final_response(llm_spans[-1])
+    tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans)
     intermediate_data = IntermediateData(
         tool_uses=tool_uses,
@@ -177,7 +181,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
         )
     llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
     llm_request = parse_json(llm_request_raw)
-    for content_dict in llm_request.get("contents", []):
+    for content_dict in llm_request.get("contents", llm_request.get("Contents", [])):
         if content_dict.get("role") == "user":
             return _content_from_dict(content_dict)
     raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
@@ -193,7 +197,7 @@ def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
         )
     llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
     llm_response = parse_json(llm_response_raw)
-    content_dict = llm_response.get("content", {})
+    content_dict = llm_response.get("content", llm_response.get("Content", {}))
     if not content_dict:
         raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")
     logger.warning(
@@ -263,12 +267,12 @@ def _extract_function_calls_from_llm_response(
     llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
     llm_response = parse_json(llm_response_raw)
-    content_dict = llm_response.get("content", {})
+    content_dict = llm_response.get("content", llm_response.get("Content", {}))
     parts = content_dict.get("parts", [])
     calls = []
     for part in parts:
-        fc_dict = part.get("function_call")
+        fc_dict = part.get("function_call", part.get("functionCall"))
         if fc_dict:
             calls.append(
                 genai_types.FunctionCall(
@@ -288,9 +292,9 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
     parts: list[genai_types.Part] = []
     for p in parts_dicts:
         if "text" in p:
-            parts.append(genai_types.Part(text=p["text"]))
-        elif "function_call" in p:
-            fc = p["function_call"]
+            parts.append(genai_types.Part(text=p.get("text")))
+        elif "function_call" in p or "functionCall" in p:
+            fc = p.get("function_call", p.get("functionCall"))
             parts.append(
                 genai_types.Part(
                     function_call=genai_types.FunctionCall(
@@ -300,8 +304,8 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
                     )
                 )
             )
-        elif "function_response" in p:
-            fr = p["function_response"]
+        elif "function_response" in p or "functionResponse" in p:
+            fr = p.get("function_response", p.get("functionResponse"))
             parts.append(
                 genai_types.Part(
                     function_response=genai_types.FunctionResponse(

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/extraction.py RENAMED Viewed

@@ -69,14 +69,15 @@ def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None:
     if llm_request_raw:
         llm_request = parse_json(llm_request_raw)
         if isinstance(llm_request, dict):
-            for content_dict in reversed(llm_request.get("contents", [])):
+            contents = llm_request.get("contents", llm_request.get("Contents", []))
+            for content_dict in reversed(contents):
                 if content_dict.get("role") != "user":
                     continue
                 parts = content_dict.get("parts", [])
                 text_parts = [p for p in parts if "text" in p]
                 if text_parts:
                     return " ".join(p["text"] for p in text_parts)
-            for content_dict in llm_request.get("contents", []):
+            for content_dict in contents:
                 if content_dict.get("role") == "user":
                     parts = content_dict.get("parts", [])
                     if parts:
@@ -101,7 +102,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
     if llm_response_raw:
         llm_response = parse_json(llm_response_raw)
         if isinstance(llm_response, dict):
-            content_dict = llm_response.get("content", {})
+            content_dict = llm_response.get("content", llm_response.get("Content", {}))
             if content_dict:
                 parts_dicts = content_dict.get("parts", [])
                 text_parts = [p for p in parts_dicts if "text" in p]
@@ -392,6 +393,38 @@ def is_adk_scope(span: Span) -> bool:
     return False
+def is_adk_generate_content_llm_span(span: Span) -> bool:
+    if not (span.operation_name.startswith("generate_content") or span.get_tag(OTEL_GENAI_OP) == "generate_content"):
+        return False
+    return bool(span.get_tag(ADK_LLM_REQUEST) or span.get_tag(ADK_LLM_RESPONSE))
+def is_adk_llm_span(span: Span) -> bool:
+    return span.operation_name.startswith("call_llm") or is_adk_generate_content_llm_span(span)
+def find_adk_llm_spans_in(root: Span) -> list[Span]:
+    call_llm_spans: list[Span] = []
+    generate_content_spans: list[Span] = []
+    def collect(span: Span) -> None:
+        if span.operation_name.startswith("call_llm"):
+            call_llm_spans.append(span)
+        elif is_adk_generate_content_llm_span(span):
+            generate_content_spans.append(span)
+    _walk_descendants(root, collect)
+    call_llm_spans.sort(key=lambda s: s.start_time)
+    generate_content_spans.sort(key=lambda s: s.start_time)
+    return call_llm_spans or generate_content_spans
+def _walk_descendants(span: Span, visit) -> None:
+    for child in span.children:
+        visit(child)
+        _walk_descendants(child, visit)
 def is_llm_span(span: Span) -> bool:
     return span.get_tag(OTEL_GENAI_REQUEST_MODEL) is not None
@@ -477,10 +510,7 @@ class AdkExtractor:
         return matches
     def find_llm_spans_in(self, root: Span) -> list[Span]:
-        results: list[Span] = []
-        self._walk(root, lambda s: s.operation_name.startswith("call_llm"), results)
-        results.sort(key=lambda s: s.start_time)
-        return results
+        return find_adk_llm_spans_in(root)
     def find_tool_spans_in(self, root: Span) -> list[Span]:
         results: list[Span] = []
@@ -493,7 +523,7 @@ class AdkExtractor:
             return None
         if span.operation_name.startswith("invoke_agent"):
             return "invocation"
-        if span.operation_name.startswith("call_llm"):
+        if is_adk_llm_span(span):
             return "llm"
         if span.operation_name.startswith("execute_tool"):
             return "tool"

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/openai_eval_backend.py RENAMED Viewed

@@ -31,6 +31,12 @@ _TEXT_PAIR_SCHEMA = {
     "required": ["actual_response", "expected_response"],
 }
+_ACTUAL_ONLY_SCHEMA = {
+    "type": "object",
+    "properties": {"actual_response": {"type": "string"}},
+    "required": ["actual_response"],
+}
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
     """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,33 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
             "pass_threshold": evaluator_def.threshold,
         }
+    if grader_type == "label_model":
+        return {
+            "type": "label_model",
+            "name": evaluator_def.name,
+            "model": grader["model"],
+            "input": grader["input"],
+            "labels": grader["labels"],
+            "passing_labels": grader["passing_labels"],
+        }
     raise ValueError(f"Unsupported grader type: {grader_type}")
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    include_expected: bool = True,
 ) -> list[dict[str, Any]]:
     items = []
     for i, actual_inv in enumerate(actual_invocations):
-        actual_text = _content_to_text(actual_inv.final_response)
-        if i < len(expected_invocations):
-            expected_text = _content_to_text(expected_invocations[i].final_response)
-        else:
-            expected_text = ""
-        items.append(
-            {
-                "item": {
-                    "actual_response": actual_text,
-                    "expected_response": expected_text,
-                }
-            }
-        )
+        entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
+        if include_expected:
+            expected_text = (
+                _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
+            )
+            entry["expected_response"] = expected_text
+        items.append({"item": entry})
     return items
@@ -111,13 +122,17 @@ async def evaluate_openai_eval(
             error="OPENAI_API_KEY environment variable is not set.",
         )
-    if expected_invocations is None:
+    grader_type = evaluator_def.grader["type"]
+    if grader_type == "text_similarity" and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
             error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
         )
-    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    items = _build_jsonl_items(
+        actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
+    )
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -130,12 +145,13 @@ async def evaluate_openai_eval(
     try:
         client = await asyncio.to_thread(_get_openai_client)
+        item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
         eval_obj = await asyncio.to_thread(
             client.evals.create,
-            name=f"agentevals-{evaluator_def.name}",
+            name=f"agentevals-openai-{evaluator_def.name}",
             data_source_config={
                 "type": "custom",
-                "item_schema": _TEXT_PAIR_SCHEMA,
+                "item_schema": item_schema,
                 "include_sample_schema": False,
             },
             testing_criteria=[testing_criteria],
@@ -146,7 +162,7 @@ async def evaluate_openai_eval(
         run = await asyncio.to_thread(
             client.evals.runs.create,
             eval_id=eval_id,
-            name=f"agentevals-run-{evaluator_def.name}",
+            name=f"agentevals-openai-run-{evaluator_def.name}",
             data_source={
                 "type": "jsonl",
                 "source": {
@@ -225,12 +241,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     total = result_counts.total if result_counts else 0
     eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
+    grader = evaluator_def.grader
     details: dict[str, Any] = {
         "openai_eval_id": eval_id,
         "openai_run_id": run_id,
-        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
         "result_counts": {"passed": passed, "failed": failed, "total": total},
     }
+    if grader["type"] == "text_similarity":
+        details["evaluation_metric"] = grader.get("evaluation_metric")
+    elif grader["type"] == "label_model":
+        details["model"] = grader.get("model")
+        details["passing_labels"] = grader.get("passing_labels")
     per_criteria = getattr(run, "per_testing_criteria_results", None)
     if per_criteria:
         details["per_testing_criteria"] = [

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_converter.py RENAMED Viewed

@@ -186,6 +186,108 @@ class TestConverter:
         assert len(results) == 2
         assert all(r.trace_id == "t1" for r in results)
+    def test_convert_adk_generate_content_llm_spans(self):
+        invoke = Span(
+            trace_id="t-gc",
+            span_id="invoke1",
+            parent_span_id=None,
+            operation_name="invoke_agent query_agent",
+            start_time=1000,
+            duration=10000,
+            tags={"gen_ai.operation.name": "invoke_agent"},
+        )
+        llm_1 = Span(
+            trace_id="t-gc",
+            span_id="llm1",
+            parent_span_id="invoke1",
+            operation_name="generate_content mockllm-deterministic",
+            start_time=2000,
+            duration=1000,
+            tags={
+                "gen_ai.operation.name": "generate_content",
+                "gcp.vertex.agent.llm_request": json.dumps(
+                    {"Contents": [{"role": "user", "parts": [{"text": "inspect pods"}]}]}
+                ),
+                "gcp.vertex.agent.llm_response": json.dumps(
+                    {"Content": {"role": "model", "parts": [{"text": "Calling tools."}]}}
+                ),
+            },
+        )
+        tool_1 = Span(
+            trace_id="t-gc",
+            span_id="tool1",
+            parent_span_id="invoke1",
+            operation_name="execute_tool list_pods",
+            start_time=3000,
+            duration=500,
+            tags={
+                "gen_ai.tool.name": "list_pods",
+                "gen_ai.tool.call.id": "call_1",
+                "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
+                "gcp.vertex.agent.tool_response": json.dumps({"pods": []}),
+            },
+        )
+        llm_2 = Span(
+            trace_id="t-gc",
+            span_id="llm2",
+            parent_span_id="invoke1",
+            operation_name="generate_content mockllm-deterministic",
+            start_time=4000,
+            duration=1000,
+            tags={
+                "gen_ai.operation.name": "generate_content",
+                "gcp.vertex.agent.llm_request": json.dumps({"contents": []}),
+                "gcp.vertex.agent.llm_response": json.dumps(
+                    {
+                        "Content": {
+                            "role": "model",
+                            "parts": [
+                                {
+                                    "functionCall": {
+                                        "name": "summarize_pods",
+                                        "args": {"namespace": "default"},
+                                        "id": "call_final",
+                                    }
+                                }
+                            ],
+                        }
+                    }
+                ),
+            },
+        )
+        tool_2 = Span(
+            trace_id="t-gc",
+            span_id="tool2",
+            parent_span_id="invoke1",
+            operation_name="execute_tool get_events",
+            start_time=5000,
+            duration=500,
+            tags={
+                "gen_ai.tool.name": "get_events",
+                "gen_ai.tool.call.id": "call_2",
+                "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
+                "gcp.vertex.agent.tool_response": json.dumps({"events": []}),
+            },
+        )
+        invoke.children.extend([llm_1, tool_1, llm_2, tool_2])
+        trace = Trace(
+            trace_id="t-gc",
+            root_spans=[invoke],
+            all_spans=[invoke, llm_1, tool_1, llm_2, tool_2],
+        )
+        result = convert_trace(trace)
+        assert result.warnings == []
+        assert len(result.invocations) == 1
+        inv = result.invocations[0]
+        assert inv.user_content.parts[0].text == "inspect pods"
+        final_call = inv.final_response.parts[0].function_call
+        assert final_call.name == "summarize_pods"
+        assert final_call.args == {"namespace": "default"}
+        assert final_call.id == "call_final"
+        assert [t.name for t in inv.intermediate_data.tool_uses] == ["list_pods", "get_events"]
     def test_no_invoke_agent_warns(self):
         trace = Trace(
             trace_id="empty",
@@ -207,6 +309,35 @@ class TestConverter:
         assert len(result.warnings) == 1
         assert "no invoke_agent" in result.warnings[0]
+    def test_no_llm_descendants_warns_with_compatible_shapes(self):
+        invoke = Span(
+            trace_id="no-llm",
+            span_id="invoke-no-llm",
+            parent_span_id=None,
+            operation_name="invoke_agent test_agent",
+            start_time=1000,
+            duration=1000,
+            tags={
+                "otel.scope.name": "gcp.vertex.agent",
+                "gen_ai.operation.name": "invoke_agent",
+            },
+        )
+        trace = Trace(
+            trace_id="no-llm",
+            root_spans=[invoke],
+            all_spans=[invoke],
+        )
+        result = convert_trace(trace)
+        assert result.invocations == []
+        assert len(result.warnings) == 1
+        warning = result.warnings[0]
+        assert "invoke-no-llm" in warning
+        assert "no converter-compatible ADK LLM descendants" in warning
+        assert "call_llm" in warning
+        assert "ADK generate_content" in warning
     def test_no_tool_spans_fallback_to_llm_response(self):
         """When no execute_tool spans exist, function_calls should be
         extracted from call_llm responses instead."""

{agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_extraction.py RENAMED Viewed

@@ -107,6 +107,18 @@ class TestExtractUserText:
         }
         assert extract_user_text_from_attrs(attrs) == "Second"
+    def test_adk_llm_request_outer_contents_pascalcase(self):
+        attrs = {
+            ADK_LLM_REQUEST: json.dumps(
+                {
+                    "Contents": [
+                        {"role": "user", "parts": [{"text": "Outer PascalCase only"}]},
+                    ]
+                }
+            )
+        }
+        assert extract_user_text_from_attrs(attrs) == "Outer PascalCase only"
     def test_genai_content_based(self):
         attrs = {
             OTEL_GENAI_INPUT_MESSAGES: json.dumps(
@@ -170,6 +182,10 @@ class TestExtractAgentResponse:
         attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"text": "ADK response"}]}})}
         assert extract_agent_response_from_attrs(attrs) == "ADK response"
+    def test_adk_llm_response_outer_content_pascalcase(self):
+        attrs = {ADK_LLM_RESPONSE: json.dumps({"Content": {"parts": [{"text": "Outer Content only"}]}})}
+        assert extract_agent_response_from_attrs(attrs) == "Outer Content only"
     def test_genai_content_based(self):
         attrs = {
             OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
@@ -519,6 +535,39 @@ class TestAdkExtractorSpanFinding:
         ext = AdkExtractor()
         assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
+    def test_find_llm_spans_in_falls_back_to_adk_generate_content(self):
+        child_llm = _span(
+            op="generate_content mockllm-deterministic",
+            tags={ADK_LLM_REQUEST: "{}"},
+            span_id="llm1",
+        )
+        child_tool = _span(op="execute_tool search", span_id="tool1")
+        root = _span(op="invoke_agent a", children=[child_llm, child_tool])
+        ext = AdkExtractor()
+        assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
+    def test_find_llm_spans_in_ignores_provider_generate_content_without_adk_payload(self):
+        child_llm = _span(
+            op="generate_content gpt-4",
+            tags={OTEL_GENAI_REQUEST_MODEL: "gpt-4"},
+            span_id="llm1",
+        )
+        root = _span(op="invoke_agent a", children=[child_llm])
+        ext = AdkExtractor()
+        assert ext.find_llm_spans_in(root) == []
+    def test_find_llm_spans_in_prefers_call_llm_over_generate_content(self):
+        call_llm = _span(op="call_llm gemini", span_id="llm1", start_time=20)
+        generate_content = _span(
+            op="generate_content gemini",
+            tags={ADK_LLM_REQUEST: "{}"},
+            span_id="llm2",
+            start_time=10,
+        )
+        root = _span(op="invoke_agent a", children=[generate_content, call_llm])
+        ext = AdkExtractor()
+        assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
     def test_find_tool_spans_in(self):
         child_llm = _span(op="call_llm gemini", span_id="llm1")
         child_tool = _span(op="execute_tool search", span_id="tool1")
@@ -530,6 +579,7 @@ class TestAdkExtractorSpanFinding:
         ext = AdkExtractor()
         assert ext.classify_span(_span(op="invoke_agent a", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "invocation"
         assert ext.classify_span(_span(op="call_llm", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "llm"
+        assert ext.classify_span(_span(op="generate_content", tags={ADK_LLM_REQUEST: "{}"})) == "llm"
         assert ext.classify_span(_span(op="execute_tool x", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "tool"
         assert ext.classify_span(_span(op="random")) is None

agentevals-cli 0.9.0__tar.gz → 0.9.1__tar.gz

agentevals-cli 0.9.0.tar.gz → 0.9.1.tar.gz