ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/service_provider/ollama_provider.py
@@ -1,40 +1,410 @@
-import requests
 import json
-from wxo_agentic_evaluation.service_provider.provider import Provider
-from typing import List
+import logging
 import os
+import time
+import uuid
+from typing import Any, Dict, Iterator, List, Optional, Sequence
+
+import requests
+
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
+
+logger = logging.getLogger(__name__)
 
 OLLAMA_URL = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
 
 
+def _truncate(value: Any, max_len: int = 1000) -> str:
+    if value is None:
+        return ""
+    s = str(value)
+    return (
+        s
+        if len(s) <= max_len
+        else s[:max_len] + f"... [truncated {len(s) - max_len} chars]"
+    )
+
+
+def _translate_params_to_ollama_options(
+    params: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Map generic params to Ollama 'options' field.
+    Ollama options docs: https://github.com/ollama/ollama/blob/main/docs/modelfile.md#parameters
+    """
+    p = params or {}
+    out: Dict[str, Any] = {}
+
+    for key in ("temperature", "top_p", "top_k", "stop", "seed"):
+        if key in p:
+            out[key] = p[key]
+
+    if "max_new_tokens" in p:
+        out["num_predict"] = p["max_new_tokens"]
+    elif "max_tokens" in p:
+        out["num_predict"] = p["max_tokens"]
+
+    if "repeat_penalty" in p:
+        out["repeat_penalty"] = p["repeat_penalty"]
+    if "repeat_last_n" in p:
+        out["repeat_last_n"] = p["repeat_last_n"]
+
+    return out
+
+
 class OllamaProvider(Provider):
     def __init__(
         self,
-        model_id=None
+        model_id: Optional[str] = None,
+        params: Optional[Dict[str, Any]] = None,
+        timeout: int = 300,
+        use_legacy_query: Optional[bool] = None,
+        system_prompt: Optional[str] = None,
+        token: Optional[str] = None,
+        instance_url: Optional[str] = None,
     ):
-        self.url = OLLAMA_URL + "/api/generate"
-        self.model_id = model_id
-        super().__init__()
+        super().__init__(use_legacy_query=use_legacy_query)
+        self.generate_url = (
+            OLLAMA_URL.rstrip("/") + "/api/generate"
+        )  # legacy text generation
+        self.chat_url = OLLAMA_URL.rstrip("/") + "/api/chat"  # chat endpoint
+        self.model_id = os.environ.get("MODEL_OVERRIDE", model_id)
+        logger.info("[d b]Using inference model %s", self.model_id)
+        self.params = params or {}
+        self.timeout = timeout
+        self.system_prompt = system_prompt
+
+    def old_query(self, sentence: str) -> str:
+        # Legacy /api/generate
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama generation")
 
-    def query(self, sentence: str) -> str:
-        payload = {"model": self.model_id, "prompt": sentence}
-        resp = requests.post(self.url, json=payload, stream=True)
+        options = _translate_params_to_ollama_options(self.params)
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "prompt": sentence,
+            "stream": True,
+        }
+        if options:
+            payload["options"] = options
+
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama generate request | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.generate_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+
+        resp = None
         final_text = ""
-        data = b''
-        for chunk in resp:
-            data += chunk
-            if data.endswith(b'\n'):
-                json_obj = json.loads(data)
-                if not json_obj["done"] and json_obj["response"]:
-                    final_text += json_obj["response"]
-                data = b''
+        usage: Dict[str, Any] = {}
+
+        try:
+            resp = requests.post(
+                self.generate_url,
+                json=payload,
+                stream=True,
+                timeout=self.timeout,
+            )
+
+            if resp.status_code != 200:
+                resp_text_preview = _truncate(getattr(resp, "text", ""), 2000)
+                duration_ms = int((time.time() - t0) * 1000)
+                logger.error(
+                    "[d b red]Ollama generate request failed (non-200) | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                    request_id,
+                    resp.status_code,
+                    duration_ms,
+                    resp_text_preview,
+                )
+                resp.raise_for_status()
+
+            for line in resp.iter_lines(decode_unicode=True):
+                if not line:
+                    continue
+                try:
+                    obj = json.loads(line)
+                except Exception:
+                    logger.warning(
+                        "Skipping unparsable line from Ollama generate | request_id=%s line_preview=%s",
+                        request_id,
+                        _truncate(line, 500),
+                    )
+                    continue
+
+                if not obj.get("done"):
+                    chunk = obj.get("response", "")
+                    if chunk:
+                        final_text += chunk
+                else:
+                    # Final metrics frame
+                    usage = {
+                        "prompt_eval_count": obj.get("prompt_eval_count"),
+                        "eval_count": obj.get("eval_count"),
+                        "prompt_eval_duration_ns": obj.get(
+                            "prompt_eval_duration"
+                        ),
+                        "eval_duration_ns": obj.get("eval_duration"),
+                        "total_duration_ns": obj.get("total_duration"),
+                        "load_duration_ns": obj.get("load_duration"),
+                    }
+
+            duration_ms = int((time.time() - t0) * 1000)
+            logger.debug(
+                "[d][b]Ollama generate response received | request_id=%s status_code=%s duration_ms=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(final_text, 2000),
+            )
+
+            return final_text
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = None
+            try:
+                if resp is not None and not getattr(resp, "raw", None):
+                    resp_text_preview = _truncate(
+                        getattr(resp, "text", None), 2000
+                    )
+            except Exception:
+                pass
+
+            logger.exception(
+                "Ollama generate request encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
+
+    def new_query(self, sentence: str) -> str:
+        """
+        /api/chat
+        Returns assistant message content.
+        """
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama chat")
+
+        options = _translate_params_to_ollama_options(self.params)
 
-        return final_text
+        messages: List[Dict[str, str]] = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": sentence})
+
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "messages": messages,
+            "stream": False,
+        }
+        if options:
+            payload["options"] = options
+
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama chat request (non-streaming) | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.chat_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                self.chat_url, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            # Non-streaming chat response: { "message": {"role": "assistant", "content": "..."} , "done": true, ... }
+            message = data.get("message") or {}
+            content = message.get("content", "") or ""
+            finish_reason = data.get("finish_reason")
+            usage = {
+                "prompt_eval_count": data.get("prompt_eval_count"),
+                "eval_count": data.get("eval_count"),
+                "prompt_eval_duration_ns": data.get("prompt_eval_duration"),
+                "eval_duration_ns": data.get("eval_duration"),
+                "total_duration_ns": data.get("total_duration"),
+                "load_duration_ns": data.get("load_duration"),
+            }
+
+            logger.debug(
+                "[d][b]Ollama chat response received | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+            )
+
+            return content
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "Ollama chat request encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
+
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+    ) -> ChatResult:
+        """
+        Non-streaming chat via /api/chat.
+        """
+        if not self.model_id:
+            raise ValueError("model_id must be specified for Ollama chat")
+
+        merged_params = dict(self.params or {})
+        if params:
+            merged_params.update(params)
+        options = _translate_params_to_ollama_options(merged_params)
+
+        payload: Dict[str, Any] = {
+            "model": self.model_id,
+            "messages": list(messages),
+            "stream": False,
+        }
+        if options:
+            payload["options"] = options
+
+        last_user = next(
+            (
+                m.get("content", "")
+                for m in reversed(messages)
+                if m.get("role") == "user"
+            ),
+            "",
+        )
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+
+        logger.debug(
+            "[d][b]Sending Ollama chat request (non-streaming, multi-message) | request_id=%s url=%s model=%s params=%s input_preview=%s",
+            request_id,
+            self.chat_url,
+            self.model_id,
+            json.dumps(options, sort_keys=True, ensure_ascii=False),
+            _truncate(last_user, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                self.chat_url, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            message = data.get("message") or {}
+            content = message.get("content", "") or ""
+            finish_reason = data.get("finish_reason")
+            usage = {
+                "prompt_eval_count": data.get("prompt_eval_count"),
+                "eval_count": data.get("eval_count"),
+                "prompt_eval_duration_ns": data.get("prompt_eval_duration"),
+                "eval_duration_ns": data.get("eval_duration"),
+                "total_duration_ns": data.get("total_duration"),
+                "load_duration_ns": data.get("load_duration"),
+            }
+
+            logger.debug(
+                "[d][b]Ollama chat response received (non-streaming, multi-message) | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+            )
+
+            return ChatResult(
+                text=content, usage=usage, finish_reason=finish_reason, raw=data
+            )
+
+        except Exception:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "Ollama chat request (non-streaming, multi-message) encountered an error | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            raise
 
     def encode(self, sentences: List[str]) -> List[list]:
-        pass
+        raise NotImplementedError(
+            "encode is not implemented for OllamaProvider"
+        )
 
 
 if __name__ == "__main__":
-    provider = OllamaProvider(model_id="llama3.1:8b")
-    print(provider.query("ok"))
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    provider = OllamaProvider(model_id="llama3.1:8b", use_legacy_query=False)
+
+    print("new_query:", provider.query("Say hello in one sentence."))
+
+    # chat API
+    messages = [
+        {"role": "system", "content": "You are concise."},
+        {"role": "user", "content": "List three fruits."},
+    ]
+    result = provider.chat(messages)
+    print("chat:", result.text)
+
+    # Streaming chat
+    print("stream_chat:")
+    assembled = []
+    for chunk in provider.stream_chat(
+        [{"role": "user", "content": "Stream a short sentence."}]
+    ):
+        if chunk.get("delta"):
+            assembled.append(chunk["delta"])
+        if chunk.get("is_final"):
+            print("".join(assembled))
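
For orientation, here is a minimal usage sketch of the reworked OllamaProvider. It is not part of the diff: it assumes a local Ollama server reachable at OLLAMA_HOST (default http://localhost:11434) with the llama3.1:8b model already pulled, and that query() dispatches to new_query() when use_legacy_query is false, as the __main__ block above suggests.

```python
from wxo_agentic_evaluation.service_provider.ollama_provider import OllamaProvider

provider = OllamaProvider(
    model_id="llama3.1:8b",
    # Generic params; _translate_params_to_ollama_options maps
    # max_new_tokens -> num_predict inside Ollama's "options" field.
    params={"temperature": 0.2, "max_new_tokens": 64},
    use_legacy_query=False,  # route query() through /api/chat
)

# Single prompt via the chat endpoint.
print(provider.query("Name one evaluation metric."))

# Multi-message chat: per-call params are merged over the constructor
# params, and the reply comes back as a ChatResult.
result = provider.chat(
    [{"role": "user", "content": "What does num_predict control?"}],
    params={"temperature": 0.0},
)
print(result.text, result.usage, result.finish_reason)
```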
wxo_agentic_evaluation/service_provider/portkey_provider.py (new file)
@@ -0,0 +1,229 @@
+import logging
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
+
+
+logger = logging.getLogger(__name__)
+try:
+    from portkey_ai.api_resources.types.chat_complete_type import (
+        ChatCompletions,
+    )
+except Exception as e:
+    logger.warning(e)
+
+
+def _extract_text_from_response(resp: Any) -> str:
+    """Extract assistant text from common Portkey response shapes.
+
+    The exact return type from the client may vary. Try several common
+    patterns and fall back to str(resp).
+    """
+    try:
+        # Common pattern like OpenAI: choices[0].message.content (string or list)
+        if isinstance(resp, dict):
+            choices = resp.get("choices")
+            if choices and len(choices) > 0:
+                choice = choices[0]
+                msg = choice.get("message") or choice.get("delta") or {}
+                if isinstance(msg, dict):
+                    content = msg.get("content")
+                    # content might be a string or a list of content blocks
+                    if isinstance(content, str):
+                        return content
+                    if isinstance(content, list) and content:
+                        # content blocks might be {"type":"text","text":...}
+                        first = content[0]
+                        if isinstance(first, dict) and "text" in first:
+                            return first.get("text", "")
+                        return str(first)
+
+                # fallback: some clients return choices[0].text
+                if "text" in choice:
+                    return choice.get("text") or ""
+
+        # If not a dict, try objects with attributes
+        if hasattr(resp, "choices"):
+            choices = getattr(resp, "choices")
+            if choices:
+                c0 = choices[0]
+                if hasattr(c0, "message") and getattr(c0, "message"):
+                    m = getattr(c0, "message")
+                    if isinstance(m, dict):
+                        return _extract_text_from_response({"choices": [m]})
+                    # message may be an object; try to get content attr
+                    if hasattr(m, "content"):
+                        return getattr(m, "content")
+
+    except Exception:
+        # parsing should never raise to caller; fall through to str(resp)
+        pass
+
+    # Last resort
+    try:
+        return str(resp)
+    except Exception:
+        return ""
+
+
+class PortkeyProvider(Provider):
+    """Provider that delegates to the Portkey AI client"""
+
+    def __init__(
+        self,
+        provider: str,
+        api_key: Optional[str] = None,
+        model_id: Optional[str] = None,
+        embedding_model: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: int = 60,
+        system_prompt: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+
+        self.provider = provider
+        self.api_key = api_key
+        self.model_id = model_id
+        self.embedding_model = embedding_model
+        self.base_url = base_url
+        self.timeout = timeout * 1000  # convert to ms
+        self.system_prompt = system_prompt
+
+        # Lazy import - avoid hard dependency at import time
+        self._client = None
+        if self.api_key is not None:
+            try:
+
+                from portkey_ai import Portkey  # type: ignore
+
+                client_kwargs = {
+                    "provider": self.provider,
+                    "Authorization": self.api_key,
+                }
+                if self.base_url:
+                    client_kwargs["base_url"] = base_url
+                if self.timeout:
+                    client_kwargs["request_timeout"] = self.timeout
+                # Add any remaining kwargs
+                client_kwargs.update(kwargs)
+                # construct client
+                self._client = Portkey(**client_kwargs)
+            except Exception as e:  # ImportError or runtime errors
+                # Do not fail hard on import; surface when used
+                logger.debug("portkey_ai import/initialization failed: %s", e)
+                self._client = None
+
+    def _require_client(self) -> None:
+        if self._client is None:
+            raise ImportError(
+                "portkey_ai client is not available. Install 'portkey_ai' and provide a valid api_key."
+            )
+
+    def old_query(self, sentence: str, extract_text: bool = False) -> str:
+        return self.new_query(sentence, extract_text)
+
+    def new_query(self, sentence: str, extract_text: bool = False) -> str:
+        """Send a single user message and return assistant text."""
+        self._require_client()
+
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": sentence})
+
+        resp = self._client.chat.completions.create(
+            messages=messages, model=self.model_id
+        )
+
+        if extract_text:
+            return _extract_text_from_response(resp)
+
+        return resp
+
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+        return_chat_completions: bool = True,
+    ) -> Union[ChatResult, ChatCompletions]:
+        self._require_client()
+
+        # build messages for Portkey: pass them mostly through
+        port_messages = []
+        for m in messages:
+            # Portkey expects simple role/content pairs for chat
+            role = m.get("role")
+            content = m.get("content", "")
+            port_messages.append({"role": role, "content": content})
+
+        kwargs = {}
+        if params:
+            kwargs.update(params)
+
+        try:
+            resp = self._client.chat.completions.create(
+                messages=port_messages, model=self.model_id, **kwargs
+            )
+        except TypeError:
+            # fallback if client signature differs
+            resp = self._client.chat.completions.create(
+                messages=port_messages, model=self.model_id
+            )
+
+        if return_chat_completions:
+            return resp
+
+        # try to extract text, usage and finish reason
+        text = _extract_text_from_response(resp)
+
+        usage = None
+        finish_reason = None
+        if isinstance(resp, dict):
+            usage = resp.get("usage")
+            try:
+                finish_reason = resp.get("choices", [])[0].get("finish_reason")
+            except Exception:
+                finish_reason = None
+
+        return ChatResult(
+            text=text, usage=usage, finish_reason=finish_reason, raw=resp
+        )
+
+    def encode(self, sentences: List[str]) -> List[list]:
+        if self.embedding_model is None:
+            raise Exception(
+                "embedding model id must be specified for text encoding"
+            )
+
+        self._require_client()
+
+        try:
+            resp = self._client.embeddings.create(
+                inputs=sentences, model=self.embedding_model
+            )
+        except TypeError:
+            resp = self._client.embeddings.create(
+                inputs=sentences, model=self.embedding_model
+            )
+
+        # Try common shapes: {'data': [{'embedding': [...]}, ...]} or {'results': ...}
+        if isinstance(resp, dict):
+            if "data" in resp:
+                return [d.get("embedding") for d in resp.get("data", [])]
+            if "results" in resp:
+                return [r.get("embedding") for r in resp.get("results", [])]
+
+        # If the client returns list directly
+        if isinstance(resp, list):
+            # expect list of embeddings
+            return resp
+
+        # Unknown shape -> try to coerce
+        try:
+            return [list(e) for e in resp]
+        except Exception:
+            raise ValueError("Unexpected response from embeddings request")
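
Likewise, a minimal sketch for the new PortkeyProvider, not part of the diff: it assumes the optional portkey_ai package is installed, and the PORTKEY_API_KEY environment variable, provider name, and model id below are illustrative placeholders.

```python
import os

from wxo_agentic_evaluation.service_provider.portkey_provider import PortkeyProvider

provider = PortkeyProvider(
    provider="openai",  # upstream provider routed through Portkey
    api_key=os.environ["PORTKEY_API_KEY"],  # hypothetical env var
    model_id="gpt-4o-mini",  # illustrative model id
)

# By default chat() returns the client's raw ChatCompletions object.
raw = provider.chat([{"role": "user", "content": "Say hi."}])

# With return_chat_completions=False the response is normalized into a
# ChatResult (text extracted via _extract_text_from_response).
result = provider.chat(
    [{"role": "user", "content": "Say hi."}],
    return_chat_completions=False,
)
print(result.text, result.finish_reason)
```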