PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py DELETED Viewed

@@ -1,176 +0,0 @@
-from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
-from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
-from typing import Any, Dict, List, Union
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
-import json
-import glob
-file_paths = glob.glob("airline_traces/*.json")
-def convert_span_to_messages(span: Dict[str, Any]) -> List[Message]:
-    attrs: Dict[str, str] = {}
-    for attr in span.get("attributes", []):
-        k = attr.get("key")
-        v_obj = attr.get("value", {})
-        v = v_obj.get("stringValue")
-        if v is None and v_obj:
-            v = next(iter(v_obj.values()))
-        if isinstance(v, (str, int, float, bool)):
-            attrs[k] = str(v)
-        else:
-            attrs[k] = json.dumps(v) if v is not None else ""
-    def collect_message_indexes(prefix: str) -> List[int]:
-        idxs = set()
-        plen = len(prefix)
-        for k in attrs:
-            if k.startswith(prefix):
-                rest = k[plen:]
-                first = rest.split(".", 1)[0]
-                if first.isdigit():
-                    idxs.add(int(first))
-        return sorted(idxs)
-    messages: List[Message] = []
-    in_prefix = "llm.input_messages."
-    for i in collect_message_indexes(in_prefix):
-        role = attrs.get(f"{in_prefix}{i}.message.role", "")
-        tc_prefix = f"{in_prefix}{i}.message.tool_calls."
-        has_tool_calls = any(k.startswith(tc_prefix) for k in attrs.keys())
-        if has_tool_calls:
-            call_indexes = set()
-            for k in attrs.keys():
-                if k.startswith(tc_prefix):
-                    rest = k[len(tc_prefix):]
-                    first = rest.split(".", 1)[0]
-                    if first.isdigit():
-                        call_indexes.add(int(first))
-            for ci in sorted(call_indexes):
-                name = attrs.get(f"{tc_prefix}{ci}.tool_call.function.name", "")
-                args_raw = attrs.get(f"{tc_prefix}{ci}.tool_call.function.arguments", "{}")
-                tool_call_id = attrs.get(f"{tc_prefix}{ci}.tool_call.id", "")
-                try:
-                    args = json.loads(args_raw)
-                except Exception:
-                    args = {"raw": args_raw}
-                messages.append(
-                    Message(
-                        role="assistant",
-                        content=json.dumps({"args": args, "name": name, "tool_call_id": tool_call_id}),
-                        type=ContentType.tool_call,
-                    )
-                )
-        else:
-            content = attrs.get(f"{in_prefix}{i}.message.content", "")
-            messages.append(
-                Message(
-                    role=role if role in {"user", "assistant", "tool"} else "user",
-                    content=content,
-                    type=ContentType.text,
-                )
-            )
-        if role == "tool":
-            pass
-    out_prefix = "llm.output_messages."
-    for i in collect_message_indexes(out_prefix):
-        role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
-        content = attrs.get(f"{out_prefix}{i}.message.content", "")
-        messages.append(
-            Message(
-                role=role if role in {"user", "assistant", "tool"} else "assistant",
-                content=content,
-                type=ContentType.text,
-            )
-        )
-    return messages
-total = 0
-success = 0
-for i, file in enumerate(file_paths):
-    # if i != 2:
-    #     continue
-    with open(file, "r") as f:
-        data = json.load(f)
-    messages = []
-    for span in data["resourceSpans"][0]["scopeSpans"][0]["spans"]:
-        temp = convert_span_to_messages(span)
-        if len(temp) > len(messages):
-            messages = temp
-    for msg in messages:
-        #print(msg.role, msg.content)
-        pass
-    task_id = None
-    for kv in data["resourceSpans"][0]["scopeSpans"][0]["spans"][-1]["attributes"]:
-        if kv["key"] == "task.index":
-            task_id = int(kv["value"]["stringValue"])
-    task = TASKS[task_id].model_dump()
-    goal_temp = []
-    goals = {}
-    goal_details = []
-    i = 0
-    for action in task["actions"]:
-        goal_temp.append(action["name"] + f"_{i}")
-        args = {}
-        for k,v in action["kwargs"].items():
-            args[k] = v
-        goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": args }
-        goal_details.append(goal_detail)
-        i += 1
-    if not goal_temp:
-        continue
-    if len(goal_temp) == 1:
-        goals[goal_temp[0]] = []
-    else:
-        for i in range(len(goal_temp)-1):
-            goals.update({goal_temp[i]: [goal_temp[i+1]]})
-        goals[goal_temp[-1]]= []
-    gt_data = {
-        "agent": "airline_agent",
-        "goals": goals,
-        "goal_details": goal_details,
-        "story": task["instruction"],
-        "starting_sentence": "",
-    }
-    gt_data = EvaluationData.model_validate(gt_data)
-    tc_name = f"airline_test_{i}"
-    try:
-        evaluation_package = EvaluationPackage(
-            test_case_name=tc_name,
-            messages=messages,
-            ground_truth=gt_data,
-            conversational_search_data=None,
-            resource_map=None
-        )
-        (
-            keyword_semantic_matches,
-            knowledge_base_metrics,
-            messages_with_reason,
-            metrics,
-        ) = evaluation_package.generate_summary()
-        success += metrics.is_success
-        total += 1
-    except Exception as e:
-        raise e
-print(success/total)
-print(total)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt RENAMED Viewed

File without changes

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl