PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/simluation_runner.py ADDED Viewed

@@ -0,0 +1,125 @@
+from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import EvaluationController
+from langfuse import get_client
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import RuntimeAdapter
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
+from wxo_agentic_evaluation.type import Message, RuntimeResponse
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.llm_user_v2 import LLMUserV2
+from wxo_agentic_evaluation.arg_configs import ControllerConfig
+from wxo_agentic_evaluation.hr_agent_langgraph import agent
+from dotenv import load_dotenv
+# Load variables from a local .env file (python-dotenv) before the os.getenv calls below.
+load_dotenv()
+import os
+import base64
+# Set at import time, i.e. before the provider is built in the __main__ section.
+# NOTE(review): presumably consumed by wxo_agentic_evaluation.service_provider - confirm.
+os.environ["USE_PORTKEY_PROVIDER"] = "true"
+# Langfuse credentials. os.getenv returns None for an unset variable, in which
+# case the header built below encodes the literal text "None:None".
+lf_public = os.getenv("LANGFUSE_PUBLIC_KEY")
+lf_secret = os.getenv("LANGFUSE_SECRET_KEY")
+# HTTP Basic auth value: base64("<public>:<secret>").
+auth_bytes = f"{lf_public}:{lf_secret}".encode("utf-8")
+auth_b64 = base64.b64encode(auth_bytes).decode("ascii")
+HEADERS = {"Authorization": f"Basic {auth_b64}"}
+# Trailing slashes are stripped so the endpoint path is joined with exactly one "/".
+lf_base_url = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com").rstrip("/")
+OTEL_ENDPOINT = f"{lf_base_url}/api/public/otel/v1/traces"
+from phoenix.otel import register
+# Import-time side effect: registers trace export to OTEL_ENDPOINT with the auth header.
+register(endpoint=OTEL_ENDPOINT, headers=HEADERS, auto_instrument=True)
+# NOTE(review): not referenced by any other code visible in this file.
+context = {"session_id": "1", "chat_history": []}
+class MyAgentWrapper(RuntimeAdapter):
+    """Runtime adapter that forwards a single user message to the imported ``agent``."""
+    def run(
+        self,
+        user_message: Message,
+        context: dict,
+        thread_id=None,
+    ) -> RuntimeResponse:
+        """Invoke ``agent`` with ``user_message`` and wrap its last message as the reply.
+
+        ``context`` and ``thread_id`` are accepted but not used by this wrapper.
+        """
+        payload = {"messages": [user_message.model_dump()]}
+        agent_output = agent.invoke(payload)
+        # Only the final message of the agent's output is surfaced to the caller.
+        final_message = agent_output["messages"][-1]
+        reply = Message(role="assistant", content=final_message.content)
+        return RuntimeResponse(messages=[reply])
+agent_wrapper = MyAgentWrapper()
+from openinference.instrumentation import using_session
+class SimulationRunner:
+    """Runs simulated conversations through an ``EvaluationController``, one per dataset item."""
+    def __init__(self, user_agent: LLMUser,
+                 agent: RuntimeAdapter,
+                 config: ControllerConfig):
+        """Build the controller from the simulated user, the agent runtime and the config."""
+        self.evaluation_controller = EvaluationController(
+            runtime=agent,
+            llm_user=user_agent,
+            config=config,
+        )
+        # Number of items processed so far; also used as the per-item session suffix.
+        self.counter = 0
+    def run_wrapper(self, session_id = 'session-id-test-00'):
+        """Return a task callable that runs one simulated conversation per item.
+
+        Each call of the returned task runs inside the session
+        ``"<session_id>-<counter>"`` and then increments the counter.
+        """
+        def run_task(*, item, **kwargs):
+            """
+            Task function for Langfuse experiment.
+            Item input should be: {"story": "...", "starting_sentence": "...", "agent": "..."}
+            Returns the thread id for a WXORuntimeAdapter runtime, else ``session_id``.
+            """
+            with using_session(f"{session_id}-{self.counter}"):
+                item_input = item.input
+                user_story = item_input.get("story")
+                starting_sentence = item_input.get("starting_sentence")
+                agent_name = item_input.get("agent")
+                _, _, _, thread_id = self.evaluation_controller.run(
+                    self.counter,
+                    agent_name=agent_name,
+                    story=user_story,
+                    starting_user_input=starting_sentence,
+                )
+                self.counter += 1
+                if isinstance(self.evaluation_controller.runtime, WXORuntimeAdapter):
+                    return thread_id
+            # NOTE(review): this is the base id, not the "<session_id>-<counter>"
+            # value the conversation actually ran under - confirm that is intended.
+            return session_id
+        return run_task
+# Demo entry point: uploads one sample test case to Langfuse, runs it as an
+# experiment through SimulationRunner, then sends a single message to the agent.
+if __name__ == "__main__":
+    import json
+    # Path is relative to the current working directory.
+    with open("benchmarks/hr_sample/data_simple.json") as f:
+        data = json.load(f)
+    langfuse = get_client()
+    langfuse.create_dataset(name="dataset-test-00")
+    # Upload to Langfuse
+    # NOTE(review): the item input carries no "agent" key, so run_task's
+    # input.get("agent") evaluates to None for this item.
+    langfuse.create_dataset_item(
+        dataset_name="dataset-test-00",
+        # any python object or value
+        input={"story": data["story"], "starting_sentence": data["starting_sentence"]},
+        # any python object or value, optional
+        expected_output={"goals": data["goals"], "goal_details": data["goal_details"]},
+    )
+    from wxo_agentic_evaluation.service_provider import get_provider
+    model_id = "gpt-4o-mini"
+    provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"),
+                            use_portkey_provider=True)
+    # The prompt path is also relative to the current working directory.
+    llm_user = LLMUserV2(llm_client=provider, user_prompt_path="src/wxo_agentic_evaluation/prompt/universal_user_template.jinja2")
+    config = ControllerConfig()
+    simluation_runner = SimulationRunner(agent = agent_wrapper, user_agent=llm_user, config=config)
+    dataset = langfuse.get_dataset("dataset-test-00")
+    result = dataset.run_experiment(
+        name="experiment-test-00",
+        description="Synthetic conversations from persona/scenario pairs",
+        task=simluation_runner.run_wrapper()
+    )
+    get_client().flush()
+    session_id = "dummy-1"
+    # The experiment result bound to `result` above is overwritten here before
+    # anything prints it; only the single-message agent reply is printed.
+    with using_session(session_id):
+        result = agent_wrapper.run(Message(role="user", content="hi"), context={})
+    print(result)

wxo_agentic_evaluation/test_prompt.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
 def parse_json_string(input_string):
     json_char_count = 0
     json_objects = []
@@ -31,9 +30,10 @@ def parse_json_string(input_string):
     is_thinking_step = len(input_string) - json_char_count > 10
     return json_objects
 wai_client = WatsonXProvider(model_id="meta-llama/llama-3-405b-instruct")
-prompt =  """
+prompt = """
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
 You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call
@@ -83,12 +83,12 @@ test_sample2 = """
  <|start_header_id|>ipython<|end_header_id|>"""
 outputs = wai_client.query(prompt + test_sample1)
 import json
 print(outputs["generated_text"])
 json_obj = parse_json_string(outputs["generated_text"])[0]
-print(json_obj)
+print(json_obj)

wxo_agentic_evaluation/type.py CHANGED Viewed

@@ -1,8 +1,21 @@
-from enum import StrEnum
-from typing import Any, Dict, List, Optional, Union
+from enum import Enum, StrEnum
+from hashlib import md5
+from typing import Any, Dict, List, Literal, Mapping, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field
-from rich.text import Text
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    computed_field,
+    model_validator,
+)
+class CallTracker(BaseModel):
+    # Pydantic model with three untyped list buckets and a free-form metadata
+    # dict; every field defaults to empty.
+    # NOTE(review): presumably groups messages by content type - confirm against callers.
+    tool_call: List = []
+    tool_response: List = []
+    generic: List = []
+    metadata: Dict[str, Any] = Field(default={})
 class EventTypes(StrEnum):
@@ -27,6 +40,11 @@ class AttackCategory(StrEnum):
     off_policy = "off_policy"
+class Roles(Enum):
+    # Conversation participant roles. This is a plain Enum (the neighbouring
+    # enums in this module are StrEnum), so members do not compare equal to raw
+    # strings; use ``.value`` when comparing against a ``str`` role.
+    ASSISTANT = "assistant"
+    USER = "user"
 class ConversationalSearchCitations(BaseModel):
     url: str
     body: str
@@ -90,10 +108,35 @@ class ConversationalSearch(BaseModel):
     response_length_option: str
+class OTelParserFunction(BaseModel):
+    """OpenAI chat completion function structure for OTel parser tool calls"""
+    name: str
+    arguments: str  # JSON string of arguments
+    # frozen=True: instances reject attribute assignment after construction.
+    model_config = ConfigDict(frozen=True)
+    def __str__(self):
+        # "<name>:<arguments>"; embedded in OTelParserToolCall.__str__.
+        return f"{self.name}:{self.arguments}"
+class OTelParserToolCall(BaseModel):
+    """OpenAI chat completion tool call structure for OTel parser"""
+    id: str
+    function: OTelParserFunction
+    # Only the literal "function" is accepted; it is also the default.
+    type: Literal["function"] = "function"
+    # frozen=True: instances reject attribute assignment after construction.
+    model_config = ConfigDict(frozen=True)
+    def __str__(self):
+        # "<id>:<type>:<name>:<arguments>"; used by OTelParserMessage.hash().
+        return f"{self.id}:{self.type}:{self.function}"
 class Message(BaseModel):
     role: str
     content: Union[str, Dict[str, Any]]
-    type: ContentType
+    type: ContentType = None
     # event that produced the message
     event: Optional[str] = None
     # used to correlate the Message with the retrieval context (ConversationalSearch)
@@ -107,18 +150,70 @@ class ExtendedMessage(BaseModel):
     reason: dict | list | None = None
+class OTelParserMessage(Message):
+    """Message class for OTel parser with OpenAI-compatible tool call fields.
+    Inherits from Message and adds structured tool call fields for compatibility
+    with OpenTelemetry trace parsing (LangGraph, Pydantic AI, etc.)
+    """
+    tool_calls: Optional[List[OTelParserToolCall]] = None
+    tool_call_id: Optional[str] = None
+    def hash(self) -> str:
+        """Generate hash for message deduplication.
+
+        Returns the hex MD5 digest of role, content, tool calls and tool call
+        id joined with ":". Missing or falsy parts contribute an empty string.
+        """
+        parts = [
+            self.role,
+            str(self.content) if self.content else "",
+            (
+                ":".join(str(tc) for tc in self.tool_calls)
+                if self.tool_calls
+                else ""
+            ),
+            self.tool_call_id or "",
+        ]
+        # The digest is only a dedup key, not a security primitive. Saying so
+        # keeps md5 usable on FIPS-restricted Python builds; the digest value
+        # itself is unchanged.
+        return md5(
+            ":".join(parts).encode("utf-8"), usedforsecurity=False
+        ).hexdigest()
 class KnowledgeBaseGoalDetail(BaseModel):
     enabled: bool = False
     metrics: list = []
+class MatchingStrategy(StrEnum):
+    """Argument matching strategy.
+
+    strict: exact match
+    optional: optional argument, exact match if the field exists
+    fuzzy: semantic/similarity match
+    """
+    strict = "strict"
+    optional = "optional"
+    fuzzy = "fuzzy"
 class GoalDetail(BaseModel):
+    """One expected goal of a test case, with optional per-argument matching rules."""
     name: str
-    tool_name: str = None
+    tool_name: Optional[str] = None
     type: ContentType
-    args: Dict = None
-    response: str = None
-    keywords: List = None
+    args: Optional[Dict] = None
+    # matching strategy defaults to `strict` matching if not specified in the test case
+    arg_matching: Optional[dict[str, MatchingStrategy]] = Field(
+        default_factory=dict
+    )
+    response: Optional[str] = None
+    keywords: Optional[List] = None
+    @model_validator(mode="after")
+    def validate_arg_matching(self):
+        """Ensure every key of ``arg_matching`` names an argument present in ``args``.
+
+        Raises:
+            ValueError: if ``arg_matching`` refers to an argument missing from ``args``.
+        """
+        # Both fields are Optional. Treat None as empty so that a goal without
+        # ``args`` fails with the descriptive ValueError below instead of a bare
+        # TypeError from ``in None`` / iterating None.
+        goal_args = self.args or {}
+        for field in self.arg_matching or {}:
+            if field not in goal_args:
+                raise ValueError(
+                    f"{field} not in goal arguments for goal {self.name}"
+                )
+        return self
+class GoalDetailOrchestrate(GoalDetail):
     knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
@@ -131,23 +226,97 @@ class AttackData(BaseModel):
 class AttackData(BaseModel):
     agent: str
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     attack_data: AttackData
     story: str
     starting_sentence: str
-    goals: Dict = None
-    goal_details: List[GoalDetail] = None
+    goals: dict | None = None
+    goal_details: list[GoalDetail] | None = None
-class EvaluationData(BaseModel):
-    agent: str
-    goals: Dict
+class DatasetModel(BaseModel):
+    # Test-case record that replaces EvaluationData in this diff: story, goals
+    # and goal_details are required; the remaining fields default to None.
+    starting_sentence: str | None = None
     story: str
+    goals: Mapping[str, Any]
     goal_details: List[GoalDetail]
-    starting_sentence: str = None
+    # NOTE(review): presumably caps the simulated user's turns - confirm against the controller.
+    max_user_turns: int | None = None
+    agent: str | None = None
+class LangfuseDatasetModel(DatasetModel):
+    """DatasetModel that also exposes its fields as Langfuse input/expected-output mappings."""
+    @computed_field
+    @property
+    def langfuse_input(self) -> Mapping[str, Any]:
+        """Mapping with the ``starting_sentence``, ``story`` and ``agent`` fields."""
+        return {
+            "starting_sentence": self.starting_sentence,
+            "story": self.story,
+            "agent": self.agent
+        }
+    @computed_field
+    @property
+    def langfuse_output(self) -> Mapping[str, Any]:
+        """Mapping with the ``goals`` and ``goal_details`` fields."""
+        return {"goals": self.goals, "goal_details": self.goal_details}
+def _convert_to_langfuse_format(langfuse_row) -> LangfuseDatasetModel:
+    """Build a ``LangfuseDatasetModel`` from a row exposing ``input`` and ``expected_output``.
+
+    Both attributes are read with ``.get``; every entry of
+    ``expected_output["goal_details"]`` is validated as a ``GoalDetail``, and a
+    validation failure propagates to the caller.
+    """
+    row_input = langfuse_row.input
+    expected = langfuse_row.expected_output
+    return LangfuseDatasetModel(
+        starting_sentence=row_input.get("starting_sentence"),
+        story=row_input.get("story"),
+        # ``langfuse_input`` writes an "agent" key, so read it back here
+        # instead of silently dropping it on the round trip.
+        agent=row_input.get("agent"),
+        goals=expected.get("goals"),
+        # Each goal is validated exactly once, while the list is built.
+        goal_details=[
+            GoalDetail.model_validate(goal)
+            for goal in expected.get("goal_details")
+        ],
+    )
+class OrchestrateDataset(DatasetModel):
+    # Narrows DatasetModel: goal details must be GoalDetailOrchestrate and
+    # ``agent`` becomes required (no default).
+    goal_details: List[GoalDetailOrchestrate]
+    agent: str
+class LangfuseCollectionModel(BaseModel):
+    # A named group of LangfuseDatasetModel entries with an optional
+    # description (default "") and optional string-to-string metadata.
+    collection_name: str
+    datasets: List[LangfuseDatasetModel]
+    collection_description: Optional[str] = ""
+    metadata: Optional[Mapping[str, str]] = None
 class ToolDefinition(BaseModel):
     tool_description: Optional[str]
     tool_name: str
     tool_params: List[str]
+class ProviderInstancesCacheKey(BaseModel):
+    """Key made of a provider name plus hashes of its positional and keyword arguments."""
+    provider: str
+    hashed_args: str
+    hashed_kwargs: str
+    def __str__(self) -> str:
+        """Render the key as ``provider|hashed_args|hashed_kwargs``."""
+        key_parts = (self.provider, self.hashed_args, self.hashed_kwargs)
+        return "|".join(key_parts)
+class RuntimeResponse(BaseModel):
+    # Value returned by a runtime adapter's ``run``: the produced messages,
+    # plus an optional thread id and a free-form context dict (default empty).
+    messages: List[Message]
+    thread_id: str | None = None
+    context: dict = Field(default={})
+class ExperimentResult(BaseModel):
+    # Identifiers and outputs of one experiment run; all fields are required.
+    # NOTE(review): element type of ``metrics`` is not constrained here - confirm with producers.
+    experiment_name: str
+    run_id: str
+    experiment_id: str
+    metrics: list
+    session_ids: List[str]

wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py ADDED Viewed

@@ -0,0 +1,100 @@
+from wxo_agentic_evaluation.llm_user_v2 import LLMUser
+from wxo_agentic_evaluation.service_provider.portkey_provider import (
+    PortkeyProvider,
+)
+from openai import OpenAI
+import os
+import uuid
+from wxo_agentic_evaluation.type import Message, ContentType
+user_story = "Your user id is mia_li_3668. You want to fly from New York to Seattle on May 20 (one way). You do not want to fly before 11am est. You want to fly in economy. You prefer direct flights but one stopover also fine. If there are multiple options, you prefer the one with the lowest price. You have 3 baggages. You do not want insurance. You want to use your two certificates to pay. If only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card. You are reactive to the agent and will not say anything that is not asked. Your birthday is in your user profile so you do not prefer to provide it."
+# LLM client handed to the simulated user below.
+portkey_client = PortkeyProvider(
+    provider="@openai",
+    model_id="gpt-4o-mini",
+    api_key=os.environ.get("PORTKEY_API_KEY"),
+)
+# Style hints passed to every generate_user_input call.
+user_response_style = [
+    "reactive to the agent and will not say anything that is not asked",
+    "replies only in very short sentences and few words",
+]
+# NOTE(review): the prompt path is relative, so it resolves against the current
+# working directory rather than this file's location - confirm the intended cwd.
+user_agent = LLMUser(
+    llm_client=portkey_client,
+    user_prompt_path="../prompt/universal_user_template.jinja2",
+)
+# The "agent" under test in this demo is a plain OpenAI chat client.
+agent = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+def get_agent_response(messages: list[dict]) -> str:
+    """Send ``messages`` to the module-level OpenAI client and return the first choice's text."""
+    completion = agent.chat.completions.create(
+        model="gpt-4o-mini", messages=messages
+    )
+    first_choice = completion.choices[0]
+    return first_choice.message.content
+# First user utterance, passed to generate_user_input on the first turn only.
+starting_user_input = Message(
+    role="user", content="I want to fly.", type=ContentType.text
+)
+# System prompt placed at the head of the conversation history.
+agent_system_prompt = Message(
+    role="system",
+    content="You are a helpful assistant. Keep your responses short and concise.",
+    type=ContentType.text,
+)
+# Only referenced by the commented-out alternative agent calls in the loop below.
+session_id = str(uuid.uuid4())
+# Upper bound on the number of user turns in the loop below.
+max_turns = 30
+conversation_history = []
+# Alternate simulated-user and agent turns until the user says "END" or
+# max_turns is reached, then print the whole transcript.
+for _turn in range(max_turns):
+    # On the first turn the history is empty: seed it with the system prompt
+    # and a canned assistant greeting, and hand the user its opening sentence.
+    is_first_turn = not conversation_history
+    if is_first_turn:
+        conversation_history.append(agent_system_prompt)
+        conversation_history.append(
+            Message(
+                role="assistant",
+                content="Hi! How can I help you today?",
+                type=ContentType.text,
+            )
+        )
+    user_response = user_agent.generate_user_input(
+        user_story=user_story,
+        conversation_history=conversation_history,
+        user_response_style=user_response_style,
+        starting_user_input=starting_user_input if is_first_turn else None,
+    )
+    conversation_history.append(user_response)
+    print(f"User: {user_response.content}")
+    # The simulated user signals completion by including "END" in its reply.
+    if "END" in user_response.content:
+        break
+    # Get agent response
+    agent_response_content = get_agent_response(
+        [msg.model_dump() for msg in conversation_history]
+    )
+    # agent_response_content = get_langflow_agent_response(conversation_history, session_id)
+    # agent_response_content = asyncio.run(get_langgraph_agent_response(conversation_history, session_id))
+    print(f"Agent: {agent_response_content}")
+    agent_response = Message(
+        role="assistant", content=agent_response_content, type=ContentType.text
+    )
+    conversation_history.append(agent_response)
+print(conversation_history)

wxo_agentic_evaluation/utils/__init__.py CHANGED Viewed

@@ -1,6 +1,47 @@
 import json
+import os
+import tempfile
+from pathlib import Path
+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
+    ToolExtractionOpenAIFormat,
+)
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
+from wxo_agentic_evaluation.utils.utils import (
+    N_A,
+    TestCaseResources,
+    add_line_seperator,
+    list_run_files,
+    load_run_metrics,
+)
-def json_dump(output_path, object):
-    with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(object, f, indent=4)
+def json_dump(output_path, obj):
+    """
+    Atomically dump JSON to `output_path`.
+    - Creates the parent directory if needed
+    - Writes to a temporary file in the same directory first
+    - Then atomically replaces the target file
+    - Prevents corrupted/half-written JSON if process is interrupted
+    - Removes the temporary file on any failure, including KeyboardInterrupt
+      and SystemExit, and re-raises the original error
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # Same directory as the target so os.replace stays on one filesystem.
+    fd, tmp_path = tempfile.mkstemp(
+        dir=output_path.parent,
+        prefix=output_path.stem,
+        suffix=".tmp",
+        text=True,
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            json.dump(obj, f, indent=4, ensure_ascii=False)
+            f.flush()
+            # Force the bytes to disk before the rename makes them visible.
+            os.fsync(f.fileno())
+        os.replace(tmp_path, output_path)
+    except BaseException:
+        # BaseException, not Exception: an interrupt (Ctrl-C, sys.exit) must
+        # not leave a stray "*.tmp" file next to the target.
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+        raise

wxo_agentic_evaluation/utils/evaluation_discovery.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""
+Evaluation discovery mechanism.
+This module provides functionality for discovering classes that inherit from Evaluation.
+"""
+import importlib.util
+import inspect
+import os
+def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
+    """
+    Dynamically import Python files under 'directory' and find classes that
+    inherit from a class named `base_class_name` (default 'Evaluation').
+    Returns a list of non-abstract class objects, each listed once, in a
+    deterministic (sorted traversal) order.
+
+    Files whose name starts with "__" are ignored; files that fail to import
+    are reported on stdout and skipped.
+
+    NOTE: every matching file is executed on import, so only point this at
+    directories whose code is trusted.
+    """
+    subclasses = []
+    for root, dirs, files in os.walk(directory):
+        # Sorting in place makes os.walk descend in a stable order.
+        dirs.sort()
+        for file in sorted(files):
+            if not file.endswith(".py") or file.startswith("__"):
+                continue
+            filepath = os.path.join(root, file)
+            module_name = os.path.splitext(file)[0]
+            spec = importlib.util.spec_from_file_location(
+                module_name, filepath
+            )
+            if not (spec and spec.loader):
+                continue
+            module = importlib.util.module_from_spec(spec)
+            try:
+                spec.loader.exec_module(module)
+            except Exception as e:
+                print(f"Skipping {filepath} due to import error: {e}")
+                continue
+            # Inspect for subclasses
+            for _name, obj in inspect.getmembers(module, inspect.isclass):
+                # __mro__[1:] excludes the class itself, so the base class
+                # is never reported as its own subclass.
+                inherits_base = any(
+                    base.__name__ == base_class_name
+                    for base in obj.__mro__[1:]
+                )
+                if not inherits_base or inspect.isabstract(obj):
+                    continue
+                # A class imported by several scanned files is the same
+                # object in each of them; report it only once.
+                if obj not in subclasses:
+                    subclasses.append(obj)
+    return subclasses

wxo_agentic_evaluation/utils/gateway_provider_utils.py ADDED Viewed

@@ -0,0 +1,39 @@
+import os
+from functools import lru_cache
+from wxo_agentic_evaluation.arg_configs import AuthConfig
+from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
+# Connection defaults, read from the environment once at import time; the URL
+# and tenant fall back to local-development values, the token to None.
+WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
+    url=os.getenv("WXO_URL", "http://localhost:4321"),
+    tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
+    token=os.getenv("WXO_TOKEN"),
+)
+@lru_cache(maxsize=1)
+def _get_cached_wxo_client():
+    """Build the WXO client from WXO_AUTH_CONFIG_DEFAULTS once and reuse it afterwards."""
+    # TODO: remove this once the client is implemented as a Singleton.
+    defaults = WXO_AUTH_CONFIG_DEFAULTS
+    return get_wxo_client(defaults.url, defaults.tenant_name, defaults.token)
+def get_provider_kwargs(**base_kwargs: object) -> dict:
+    """Return provider keyword arguments, adding gateway credentials when needed.
+
+    The arguments are returned unchanged when the gateway provider is disabled
+    or when the caller already supplied both ``instance_url`` and ``token``.
+    Otherwise ``instance_url`` and ``token`` are taken from the cached WXO
+    client and override any value passed under those names.
+    """
+    if not USE_GATEWAY_MODEL_PROVIDER:
+        return base_kwargs
+    caller_supplied_auth = (
+        "instance_url" in base_kwargs and "token" in base_kwargs
+    )
+    if caller_supplied_auth:
+        return base_kwargs
+    wxo_client = _get_cached_wxo_client()
+    return {
+        **base_kwargs,
+        "instance_url": wxo_client.service_url,
+        "token": wxo_client.api_key,
+    }

wxo_agentic_evaluation/utils/messages_parser.py ADDED Viewed

@@ -0,0 +1,30 @@
+from typing import Optional
+from pydantic import BaseModel, Field
+from wxo_agentic_evaluation.type import ContentType, Message
+class ParsedMessages(BaseModel):
+    """
+    A parsed history of messages.
+    """
+    messages: list[Message] = Field(description="The list of messages")
+    @property
+    def user_input(self) -> Optional[str]:
+        """Content of the first text message with role "user", or None if there is none."""
+        return next(
+            (
+                str(msg.content)
+                for msg in self.messages
+                if msg.role == "user" and msg.type == ContentType.text
+            ),
+            None,
+        )
+    @property
+    def agent_response(self) -> Optional[str]:
+        """Content of the last text message with role "assistant", or None if there is none."""
+        return next(
+            (
+                str(msg.content)
+                for msg in reversed(self.messages)
+                if msg.role == "assistant" and msg.type == ContentType.text
+            ),
+            None,
+        )

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl