ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/service_provider/ollama_provider.py
+++ b/wxo_agentic_evaluation/service_provider/ollama_provider.py
@@ -1,40 +1,410 @@
-import requests
 import json
-
-from typing import List
+import logging
 import os
+import time
+import uuid
+from typing import Any, Dict, Iterator, List, Optional, Sequence
+
+import requests
+
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
+
+logger = logging.getLogger(__name__)
 
 OLLAMA_URL = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
 
 
+def _truncate(value: Any, max_len: int = 1000) -> str:
+    if value is None:
+        return ""
+    s = str(value)
+    return (
+        s
+        if len(s) <= max_len
+        else s[:max_len] + f"... [truncated {len(s) - max_len} chars]"
+    )
+
+
+def _translate_params_to_ollama_options(
+    params: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Map generic params to Ollama 'options' field.
+    Ollama options docs: https://github.com/ollama/ollama/blob/main/docs/modelfile.md#parameters
+    """
+    p = params or {}
+    out: Dict[str, Any] = {}
+
+    for key in ("temperature", "top_p", "top_k", "stop", "seed"):
+        if key in p:
+            out[key] = p[key]
+
+    if "max_new_tokens" in p:
+        out["num_predict"] = p["max_new_tokens"]
+    elif "max_tokens" in p:
+        out["num_predict"] = p["max_tokens"]
+
+    if "repeat_penalty" in p:
+        out["repeat_penalty"] = p["repeat_penalty"]
+    if "repeat_last_n" in p:
+        out["repeat_last_n"] = p["repeat_last_n"]
+
+    return out
+
+
 class OllamaProvider(Provider):
     def __init__(
         self,
-        model_id=None
+        model_id: Optional[str] = None,
+        params: Optional[Dict[str, Any]] = None,
+        timeout: int = 300,
+        use_legacy_query: Optional[bool] = None,
+        system_prompt: Optional[str] = None,
+        token: Optional[str] = None,
+        instance_url: Optional[str] = None,
     ):
-
-        self.
-
+        super().__init__(use_legacy_query=use_legacy_query)
+        self.generate_url = (
+            OLLAMA_URL.rstrip("/") + "/api/generate"
+        )  # legacy text generation
+        self.chat_url = OLLAMA_URL.rstrip("/") + "/api/chat"  # chat endpoint
+        self.model_id = os.environ.get("MODEL_OVERRIDE", model_id)
+        logger.info("[d b]Using inference model %s", self.model_id)
+        self.params = params or {}
+        self.timeout = timeout
+        self.system_prompt = system_prompt
+
+    def old_query(self, sentence: str) -> str:
+        # Legacy /api/generate
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama generation")
 
-
-        payload
-
+        options = _translate_params_to_ollama_options(self.params)
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "prompt": sentence,
+            "stream": True,
+        }
+        if options:
+            payload["options"] = options
+
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama generate request | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.generate_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+
+        resp = None
         final_text = ""
-
-
-
-
-
-
-
-
+        usage: Dict[str, Any] = {}
+
+        try:
+            resp = requests.post(
+                self.generate_url,
+                json=payload,
+                stream=True,
+                timeout=self.timeout,
+            )
+
+            if resp.status_code != 200:
+                resp_text_preview = _truncate(getattr(resp, "text", ""), 2000)
+                duration_ms = int((time.time() - t0) * 1000)
+                logger.error(
+                    "[d b red]Ollama generate request failed (non-200) | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                    request_id,
+                    resp.status_code,
+                    duration_ms,
+                    resp_text_preview,
+                )
+                resp.raise_for_status()
+
+            for line in resp.iter_lines(decode_unicode=True):
+                if not line:
+                    continue
+                try:
+                    obj = json.loads(line)
+                except Exception:
+                    logger.warning(
+                        "Skipping unparsable line from Ollama generate | request_id=%s line_preview=%s",
+                        request_id,
+                        _truncate(line, 500),
+                    )
+                    continue
+
+                if not obj.get("done"):
+                    chunk = obj.get("response", "")
+                    if chunk:
+                        final_text += chunk
+                else:
+                    # Final metrics frame
+                    usage = {
+                        "prompt_eval_count": obj.get("prompt_eval_count"),
+                        "eval_count": obj.get("eval_count"),
+                        "prompt_eval_duration_ns": obj.get(
+                            "prompt_eval_duration"
+                        ),
+                        "eval_duration_ns": obj.get("eval_duration"),
+                        "total_duration_ns": obj.get("total_duration"),
+                        "load_duration_ns": obj.get("load_duration"),
+                    }
+
+            duration_ms = int((time.time() - t0) * 1000)
+            logger.debug(
+                "[d][b]Ollama generate response received | request_id=%s status_code=%s duration_ms=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(final_text, 2000),
+            )
+
+            return final_text
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = None
+            try:
+                if resp is not None and not getattr(resp, "raw", None):
+                    resp_text_preview = _truncate(
+                        getattr(resp, "text", None), 2000
+                    )
+            except Exception:
+                pass
+
+            logger.exception(
+                "Ollama generate request encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
+
+    def new_query(self, sentence: str) -> str:
+        """
+        /api/chat
+        Returns assistant message content.
+        """
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama chat")
+
+        options = _translate_params_to_ollama_options(self.params)
 
-
+        messages: List[Dict[str, str]] = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": sentence})
+
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "messages": messages,
+            "stream": False,
+        }
+        if options:
+            payload["options"] = options
+
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama chat request (non-streaming) | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.chat_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                self.chat_url, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            # Non-streaming chat response: { "message": {"role": "assistant", "content": "..."} , "done": true, ... }
+            message = data.get("message") or {}
+            content = message.get("content", "") or ""
+            finish_reason = data.get("finish_reason")
+            usage = {
+                "prompt_eval_count": data.get("prompt_eval_count"),
+                "eval_count": data.get("eval_count"),
+                "prompt_eval_duration_ns": data.get("prompt_eval_duration"),
+                "eval_duration_ns": data.get("eval_duration"),
+                "total_duration_ns": data.get("total_duration"),
+                "load_duration_ns": data.get("load_duration"),
+            }
+
+            logger.debug(
+                "[d][b]Ollama chat response received | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+            )
+
+            return content
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "Ollama chat request encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
+
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+    ) -> ChatResult:
+        """
+        Non-streaming chat via /api/chat.
+        """
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama chat")
+
+        merged_params = dict(self.params or {})
+        if params:
+            merged_params.update(params)
+        options = _translate_params_to_ollama_options(merged_params)
+
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "messages": list(messages),
+            "stream": False,
+        }
+        if options:
+            payload["options"] = options
+
+        last_user = next(
+            (
+                m.get("content", "")
+                for m in reversed(messages)
+                if m.get("role") == "user"
+            ),
+            "",
+        )
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama chat request (non-streaming, multi-message) | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.chat_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(last_user, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                self.chat_url, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            message = data.get("message") or {}
+            content = message.get("content", "") or ""
+            finish_reason = data.get("finish_reason")
+            usage = {
+                "prompt_eval_count": data.get("prompt_eval_count"),
+                "eval_count": data.get("eval_count"),
+                "prompt_eval_duration_ns": data.get("prompt_eval_duration"),
+                "eval_duration_ns": data.get("eval_duration"),
+                "total_duration_ns": data.get("total_duration"),
+                "load_duration_ns": data.get("load_duration"),
+            }
+
+            logger.debug(
+                "[d][b]Ollama chat response received (non-streaming, multi-message) | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+            )
+
+            return ChatResult(
+                text=content, usage=usage, finish_reason=finish_reason, raw=data
+            )
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "Ollama chat request (non-streaming, multi-message) encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
 
     def encode(self, sentences: List[str]) -> List[list]:
-
+        raise NotImplementedError(
+            "encode is not implemented for OllamaProvider"
+        )
 
 
 if __name__ == "__main__":
-
-
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    provider = OllamaProvider(model_id="llama3.1:8b", use_legacy_query=False)
+
+    print("new_query:", provider.query("Say hello in one sentence."))
+
+    # chat API
+    messages = [
+        {"role": "system", "content": "You are concise."},
+        {"role": "user", "content": "List three fruits."},
+    ]
+    result = provider.chat(messages)
+    print("chat:", result.text)
+
+    # Streaming chat
+    print("stream_chat:")
+    assembled = []
+    for chunk in provider.stream_chat(
+        [{"role": "user", "content": "Stream a short sentence."}]
+    ):
+        if chunk.get("delta"):
+            assembled.append(chunk["delta"])
+        if chunk.get("is_final"):
+            print("".join(assembled))
--- /dev/null
+++ b/wxo_agentic_evaluation/service_provider/portkey_provider.py
@@ -0,0 +1,229 @@
+import logging
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
+
+
+logger = logging.getLogger(__name__)
+try:
+    from portkey_ai.api_resources.types.chat_complete_type import (
+        ChatCompletions,
+    )
+except Exception as e:
+    logger.warning(e)
+
+
+def _extract_text_from_response(resp: Any) -> str:
+    """Extract assistant text from common Portkey response shapes.
+
+    The exact return type from the client may vary. Try several common
+    patterns and fall back to str(resp).
+    """
+    try:
+        # Common pattern like OpenAI: choices[0].message.content (string or list)
+        if isinstance(resp, dict):
+            choices = resp.get("choices")
+            if choices and len(choices) > 0:
+                choice = choices[0]
+                msg = choice.get("message") or choice.get("delta") or {}
+                if isinstance(msg, dict):
+                    content = msg.get("content")
+                    # content might be a string or a list of content blocks
+                    if isinstance(content, str):
+                        return content
+                    if isinstance(content, list) and content:
+                        # content blocks might be {"type":"text","text":...}
+                        first = content[0]
+                        if isinstance(first, dict) and "text" in first:
+                            return first.get("text", "")
+                        return str(first)
+
+                # fallback: some clients return choices[0].text
+                if "text" in choice:
+                    return choice.get("text") or ""
+
+        # If not a dict, try objects with attributes
+        if hasattr(resp, "choices"):
+            choices = getattr(resp, "choices")
+            if choices:
+                c0 = choices[0]
+                if hasattr(c0, "message") and getattr(c0, "message"):
+                    m = getattr(c0, "message")
+                    if isinstance(m, dict):
+                        return _extract_text_from_response({"choices": [m]})
+                    # message may be an object; try to get content attr
+                    if hasattr(m, "content"):
+                        return getattr(m, "content")
+
+    except Exception:
+        # parsing should never raise to caller; fall through to str(resp)
+        pass
+
+    # Last resort
+    try:
+        return str(resp)
+    except Exception:
+        return ""
+
+
+class PortkeyProvider(Provider):
+    """Provider that delegates to the Portkey AI client"""
+
+    def __init__(
+        self,
+        provider: str,
+        api_key: Optional[str] = None,
+        model_id: Optional[str] = None,
+        embedding_model: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: int = 60,
+        system_prompt: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+
+        self.provider = provider
+        self.api_key = api_key
+        self.model_id = model_id
+        self.embedding_model = embedding_model
+        self.base_url = base_url
+        self.timeout = timeout * 1000  # convert to ms
+        self.system_prompt = system_prompt
+
+        # Lazy import - avoid hard dependency at import time
+        self._client = None
+        if self.api_key is not None:
+            try:
+
+                from portkey_ai import Portkey  # type: ignore
+
+                client_kwargs = {
+                    "provider": self.provider,
+                    "Authorization": self.api_key,
+                }
+                if self.base_url:
+                    client_kwargs["base_url"] = base_url
+                if self.timeout:
+                    client_kwargs["request_timeout"] = self.timeout
+                # Add any remaining kwargs
+                client_kwargs.update(kwargs)
+                # construct client
+                self._client = Portkey(**client_kwargs)
+            except Exception as e:  # ImportError or runtime errors
+                # Do not fail hard on import; surface when used
+                logger.debug("portkey_ai import/initialization failed: %s", e)
+                self._client = None
+
+    def _require_client(self) -> None:
+        if self._client is None:
+            raise ImportError(
+                "portkey_ai client is not available. Install 'portkey_ai' and provide a valid api_key."
+            )
+
+    def old_query(self, sentence: str, extract_text: bool = False) -> str:
+        return self.new_query(sentence, extract_text)
+
+    def new_query(self, sentence: str, extract_text: bool = False) -> str:
+        """Send a single user message and return assistant text."""
+        self._require_client()
+
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": sentence})
+
+        resp = self._client.chat.completions.create(
+            messages=messages, model=self.model_id
+        )
+
+        if extract_text:
+            return _extract_text_from_response(resp)
+
+        return resp
+
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+        return_chat_completions: bool = True,
+    ) -> Union[ChatResult, ChatCompletions]:
+        self._require_client()
+
+        # build messages for Portkey: pass them mostly through
+        port_messages = []
+        for m in messages:
+            # Portkey expects simple role/content pairs for chat
+            role = m.get("role")
+            content = m.get("content", "")
+            port_messages.append({"role": role, "content": content})
+
+        kwargs = {}
+        if params:
+            kwargs.update(params)
+
+        try:
+            resp = self._client.chat.completions.create(
+                messages=port_messages, model=self.model_id, **kwargs
+            )
+        except TypeError:
+            # fallback if client signature differs
+            resp = self._client.chat.completions.create(
+                messages=port_messages, model=self.model_id
+            )
+
+        if return_chat_completions:
+            return resp
+
+        # try to extract text, usage and finish reason
+        text = _extract_text_from_response(resp)
+
+        usage = None
+        finish_reason = None
+        if isinstance(resp, dict):
+            usage = resp.get("usage")
+            try:
+                finish_reason = resp.get("choices", [])[0].get("finish_reason")
+            except Exception:
+                finish_reason = None
+
+        return ChatResult(
+            text=text, usage=usage, finish_reason=finish_reason, raw=resp
+        )
+
+    def encode(self, sentences: List[str]) -> List[list]:
+        if self.embedding_model is None:
+            raise Exception(
+                "embedding model id must be specified for text encoding"
+            )
+
+        self._require_client()
+
+        try:
+            resp = self._client.embeddings.create(
+                inputs=sentences, model=self.embedding_model
+            )
+        except TypeError:
+            resp = self._client.embeddings.create(
+                inputs=sentences, model=self.embedding_model
+            )
+
+        # Try common shapes: {'data': [{'embedding': [...]}, ...]} or {'results': ...}
+        if isinstance(resp, dict):
+            if "data" in resp:
+                return [d.get("embedding") for d in resp.get("data", [])]
+            if "results" in resp:
+                return [r.get("embedding") for r in resp.get("results", [])]
+
+        # If the client returns list directly
+        if isinstance(resp, list):
+            # expect list of embeddings
+            return resp
+
+        # Unknown shape -> try to coerce
+        try:
+            return [list(e) for e in resp]
+        except Exception:
+            raise ValueError("Unexpected response from embeddings request")