ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Files changed (42)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
  3. wxo_agentic_evaluation/analyze_run.py +49 -32
  4. wxo_agentic_evaluation/arg_configs.py +30 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +20 -4
  7. wxo_agentic_evaluation/evaluation_package.py +189 -15
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +64 -34
  11. wxo_agentic_evaluation/llm_matching.py +92 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -1
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/metrics.py +24 -3
  17. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +16 -0
  19. wxo_agentic_evaluation/quick_eval.py +17 -3
  20. wxo_agentic_evaluation/record_chat.py +17 -6
  21. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
  22. wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
  23. wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
  24. wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
  25. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
  26. wxo_agentic_evaluation/service_instance.py +5 -3
  27. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  28. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  29. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  30. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  31. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  32. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  33. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  34. wxo_agentic_evaluation/type.py +14 -4
  35. wxo_agentic_evaluation/utils/__init__.py +43 -5
  36. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  37. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  38. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  39. wxo_agentic_evaluation/utils/utils.py +14 -9
  40. wxo_agentic_evaluation/wxo_client.py +2 -1
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  42. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/service_provider/model_proxy_provider.py
+++ b/wxo_agentic_evaluation/service_provider/model_proxy_provider.py
@@ -1,13 +1,21 @@
+import json
+import logging
 import os
 import time
+import uuid
 from threading import Lock
-from typing import List, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple

 import requests

-from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
 from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url

+logger = logging.getLogger(__name__)
+
 AUTH_ENDPOINT_AWS = (
     "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
 )
@@ -15,7 +23,6 @@ AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
 DEFAULT_PARAM = {
     "min_new_tokens": 1,
     "decoding_method": "greedy",
-    "max_new_tokens": 400,
 }


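With `"max_new_tokens": 400` dropped from `DEFAULT_PARAM`, output length now follows the service default unless a caller sets a limit explicitly. A minimal usage sketch of restoring the old cap through the constructor's `params` argument (the model id is a placeholder; `WO_INSTANCE`/`WO_API_KEY` are read from the environment); on the new chat path this value is mapped to `max_tokens` by the translation helper added in the next hunk:

```python
# Hypothetical usage: re-apply the previous 400-token cap per instance.
from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
    ModelProxyProvider,
)

provider = ModelProxyProvider(
    model_id="meta-llama/llama-3-3-70b-instruct",  # placeholder
    params={
        "min_new_tokens": 1,
        "decoding_method": "greedy",
        "max_new_tokens": 400,  # translated to max_tokens on the chat path
    },
)
```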
@@ -36,17 +43,62 @@ def _normalize_cpd_auth_url(url: str) -> str:
     return url


+def _truncate(value: Any, max_len: int = 1000) -> str:
+    if value is None:
+        return ""
+    s = str(value)
+    return (
+        s
+        if len(s) <= max_len
+        else s[:max_len] + f"... [truncated {len(s) - max_len} chars]"
+    )
+
+
+def _translate_params_to_chat(params: Dict[str, Any] | None) -> Dict[str, Any]:
+    # Translate legacy generation params to chat.completions params.
+    p = params or {}
+    out: Dict[str, Any] = {}
+
+    passthrough = {
+        "temperature",
+        "top_p",
+        "n",
+        "stream",
+        "stop",
+        "presence_penalty",
+        "frequency_penalty",
+        "logit_bias",
+        "user",
+        "max_tokens",
+        "seed",
+        "response_format",
+    }
+    for k in passthrough:
+        if k in p:
+            out[k] = p[k]
+
+    if "max_new_tokens" in p and "max_tokens" not in out:
+        out["max_tokens"] = p["max_new_tokens"]
+
+    return out
+
+
 class ModelProxyProvider(Provider):
     def __init__(
         self,
-        model_id=None,
-        api_key=None,
-        instance_url=None,
-        timeout=300,
-        embedding_model_id=None,
-        params=None,
+        model_id: Optional[str] = None,
+        api_key: Optional[str] = None,
+        instance_url: Optional[str] = None,
+        timeout: int = 300,
+        embedding_model_id: Optional[str] = None,
+        params: Optional[Dict[str, Any]] = None,
+        use_legacy_query: Optional[
+            bool
+        ] = None,  # Provider routes query() to old/new based on this
+        system_prompt: Optional[str] = None,
+        token: Optional[str] = None,
     ):
-        super().__init__()
+        super().__init__(use_legacy_query=use_legacy_query)

         instance_url = os.environ.get("WO_INSTANCE", instance_url)
         if not instance_url:
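The new `_translate_params_to_chat` helper passes OpenAI-style chat parameters through unchanged, maps the legacy `max_new_tokens` onto `max_tokens` without overwriting an explicit `max_tokens`, and silently drops legacy-only keys such as `decoding_method`. A trimmed, self-contained restatement of that mapping (the reduced `passthrough` set here is for illustration only):

```python
from typing import Any, Dict

def translate(p: Dict[str, Any] | None) -> Dict[str, Any]:
    # Same mapping as _translate_params_to_chat above, abbreviated.
    p = p or {}
    passthrough = {"temperature", "top_p", "stop", "max_tokens", "seed"}
    out = {k: v for k, v in p.items() if k in passthrough}
    if "max_new_tokens" in p and "max_tokens" not in out:
        out["max_tokens"] = p["max_new_tokens"]
    return out

assert translate({"max_new_tokens": 400}) == {"max_tokens": 400}
assert translate({"max_new_tokens": 400, "max_tokens": 100}) == {"max_tokens": 100}
assert translate({"decoding_method": "greedy"}) == {}  # legacy-only keys dropped
```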
@@ -56,6 +108,7 @@ class ModelProxyProvider(Provider):

         self.timeout = timeout
         self.model_id = os.environ.get("MODEL_OVERRIDE", model_id)
+        logger.info("[d b]Using inference model %s", self.model_id)
         self.embedding_model_id = embedding_model_id

         self.api_key = os.environ.get("WO_API_KEY", api_key)
@@ -106,14 +159,17 @@ class ModelProxyProvider(Provider):
                 "WO_API_KEY must be specified for SaaS or IBM IAM auth"
             )

+        # Endpoints
         self.url = (
             self.instance_url + "/ml/v1/text/generation?version=2024-05-01"
-        )
+        )  # legacy
+        self.chat_url = self.instance_url + "/ml/v1/chat/completions"  # chat
         self.embedding_url = self.instance_url + "/ml/v1/text/embeddings"

         self.lock = Lock()
         self.token, self.refresh_time = self.get_token()
         self.params = params if params else DEFAULT_PARAM
+        self.system_prompt = system_prompt

     def _resolve_auth_mode_and_url(
         self, explicit_auth_url: str | None
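The constructor now records a chat endpoint and an optional system prompt; note that the new `token` parameter in the signature is not referenced anywhere in the hunks shown. Environment variables still take precedence over constructor arguments (`WO_INSTANCE`, `MODEL_OVERRIDE`, `WO_API_KEY`). A construction sketch with placeholder values (instantiation fetches an auth token immediately via `get_token()`, so this is not a working configuration):

```python
import os

# Placeholders only; real deployments point at an actual instance.
os.environ["WO_INSTANCE"] = "https://example.orchestrate.cloud.ibm.com/instances/abc123"
os.environ["WO_API_KEY"] = "<api-key>"

provider = ModelProxyProvider(
    model_id="meta-llama/llama-3-3-70b-instruct",
    system_prompt="You are a concise assistant.",  # prepended on the chat path
    timeout=120,
)
# provider.url      -> <instance>/ml/v1/text/generation?version=2024-05-01 (legacy)
# provider.chat_url -> <instance>/ml/v1/chat/completions
```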
@@ -254,6 +310,7 @@ class ModelProxyProvider(Provider):
             json=payload,
             headers=headers,
             verify=self._wo_ssl_verify,
+            timeout=self.timeout,
         )

         if resp.status_code == 200:
@@ -262,9 +319,11 @@ class ModelProxyProvider(Provider):

         resp.raise_for_status()

-    def query(self, sentence: str) -> str:
+    def old_query(self, sentence: str) -> str:
+        # Legacy /ml/v1/text/generation
         if self.model_id is None:
             raise Exception("model id must be specified for text generation")
+
         self.refresh_token_if_expires()
         headers = self.get_header()
         payload = {
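`query()` is renamed to `old_query()`. Per the `use_legacy_query` comment in `__init__` and the routing note in the `__main__` block at the end of this diff, the `Provider` base class (changed separately in `provider.py`, +130 -10, not shown here) is expected to dispatch `query()` to either `old_query()` or `new_query()`. A hypothetical sketch of such dispatch, assuming a `USE_LEGACY_QUERY` environment flag as named in the trailing comment:

```python
import os

class Provider:
    # Sketch of the routing described by this diff's comments; the
    # actual implementation lives in service_provider/provider.py.
    def __init__(self, use_legacy_query: bool | None = None):
        if use_legacy_query is None:
            use_legacy_query = (
                os.environ.get("USE_LEGACY_QUERY", "false").lower() == "true"
            )
        self._use_legacy_query = use_legacy_query

    def query(self, sentence: str) -> str:
        # Subclasses provide old_query() / new_query().
        if self._use_legacy_query:
            return self.old_query(sentence)  # legacy /ml/v1/text/generation
        return self.new_query(sentence)  # chat-style endpoint
```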
@@ -274,18 +333,357 @@
             "timeout": self.timeout,
             "parameters": self.params,
         }
-        resp = requests.post(
-            self.url, json=payload, headers=headers, verify=self._wo_ssl_verify
+
+        request_id = str(uuid.uuid4())
+        start_time = time.time()
+
+        # Input logging
+        logger.debug(
+            "[d][b]Sending text.generation request | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            self.url,
+            self.model_id,
+            self.space_id,
+            json.dumps(
+                payload.get("parameters", {}),
+                sort_keys=True,
+                ensure_ascii=False,
+            ),
+            _truncate(sentence, 200),
         )
-        if resp.status_code == 200:
-            return resp.json()["results"][0]["generated_text"]

-        resp.raise_for_status()
+        resp = None
+        try:
+            resp = requests.post(
+                self.url,
+                json=payload,
+                headers=headers,
+                verify=self._wo_ssl_verify,
+                timeout=self.timeout,
+            )
+
+            duration_ms = int((time.time() - start_time) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            result = (
+                data["results"][0]
+                if "results" in data and data["results"]
+                else data
+            )
+            output_text = (
+                (
+                    result.get("generated_text")
+                    if isinstance(result, dict)
+                    else None
+                )
+                or (result.get("message") if isinstance(result, dict) else None)
+                or ""
+            )
+
+            # Usage (best-effort)
+            usage = data.get("usage") or {}
+            if not usage and isinstance(result, dict):
+                in_tok = result.get("input_token_count")
+                out_tok = result.get("generated_token_count") or result.get(
+                    "output_token_count"
+                )
+                if in_tok is not None or out_tok is not None:
+                    usage = {
+                        "prompt_tokens": in_tok,
+                        "completion_tokens": out_tok,
+                        "total_tokens": (in_tok or 0) + (out_tok or 0),
+                    }
+
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+
+            # Output logging
+            logger.debug(
+                "[d][b]text.generation response received | request_id=%s status_code=%s duration_ms=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(output_text, 2000),
+                api_request_id,
+            )
+
+            if output_text:
+                return output_text
+            else:
+                raise ValueError(
+                    f"Unexpected response from legacy endpoint: {data}"
+                )
+
+        except Exception as e:
+            duration_ms = int((time.time() - start_time) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "text.generation request failed | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+
+            with self.lock:
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    self.token, self.refresh_time = self.get_token()
+            raise
+
+    def new_query(self, sentence: str) -> str:
+        """
+        New /ml/v1/chat/completions
+        Returns assistant message content of the first choice.
+        """
+        if self.model_id is None:
+            raise Exception("model id must be specified for text generation")
+
+        self.refresh_token_if_expires()
+        headers = self.get_header()
+
+        messages: List[Dict[str, Any]] = []
+        if getattr(self, "system_prompt", None):
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append(
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": sentence}],
+            }
+        )
+
+        chat_params = _translate_params_to_chat(self.params)
+        if isinstance(self.params, dict) and "time_limit" in self.params:
+            chat_params["time_limit"] = self.params["time_limit"]
+
+        payload: Dict[str, Any] = {
+            "model_id": self.model_id,
+            "space_id": self.space_id,
+            "messages": messages,
+            **chat_params,
+        }
+
+        url = f"{self.instance_url}/ml/v1/text/chat?version=2024-10-08"
+        request_id = str(uuid.uuid4())
+        start_time = time.time()
+
+        logger.debug(
+            "[d][b]Sending chat.completions request | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            url,
+            self.model_id,
+            self.space_id,
+            json.dumps(chat_params, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                url,
+                json=payload,
+                headers=headers,
+                verify=self._wo_ssl_verify,
+                timeout=self.timeout,
+            )
+            duration_ms = int((time.time() - start_time) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            choice = data["choices"][0]
+            content = choice["message"]["content"]
+            finish_reason = choice.get("finish_reason")
+            usage = data.get("usage", {})
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+
+            logger.debug(
+                "[d][b]chat.completions response received | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+                api_request_id,
+            )
+
+            return content
+
+        except Exception as e:
+            duration_ms = int((time.time() - start_time) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+
+            logger.exception(
+                "chat.completions request failed | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+
+            with self.lock:
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    self.token, self.refresh_time = self.get_token()
+            raise
+
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+    ) -> ChatResult:
+        # Non-streaming chat using /ml/v1/chat/completions.
+        if self.model_id is None:
+            raise Exception("model id must be specified for chat")
+
+        self.refresh_token_if_expires()
+        headers = self.get_header()
+
+        # Convert messages to watsonx format: user content is typed list
+        wx_messages: List[Dict[str, Any]] = []
+        for m in messages:
+            role = m.get("role")
+            content = m.get("content", "")
+            if role == "user" and isinstance(content, str):
+                wx_messages.append(
+                    {
+                        "role": "user",
+                        "content": [{"type": "text", "text": content}],
+                    }
+                )
+            else:
+                wx_messages.append({"role": role, "content": content})
+
+        merged_params = dict(self.params or {})
+        if params:
+            merged_params.update(params)
+        chat_params = _translate_params_to_chat(merged_params)
+        chat_params.pop("stream", None)  # force non-streaming
+        if "time_limit" in merged_params:
+            chat_params["time_limit"] = merged_params["time_limit"]
+
+        payload: Dict[str, Any] = {
+            "model_id": self.model_id,
+            "space_id": self.space_id,
+            "messages": wx_messages,
+            **chat_params,
+        }
+
+        url = f"{self.instance_url}/ml/v1/text/chat?version=2024-10-08"
+
+        last_user = next(
+            (
+                m.get("content", "")
+                for m in reversed(messages)
+                if m.get("role") == "user"
+            ),
+            "",
+        )
+        request_id = str(uuid.uuid4())
+        start_time = time.time()
+
+        logger.debug(
+            "[d][b]Sending chat.completions request (non-streaming) | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            url,
+            self.model_id,
+            self.space_id,
+            json.dumps(chat_params, sort_keys=True, ensure_ascii=False),
+            _truncate(last_user, 200),
+        )
+
+        resp = None
+        try:
+            resp = requests.post(
+                url,
+                json=payload,
+                headers=headers,
+                verify=self._wo_ssl_verify,
+                timeout=self.timeout,
+            )
+            duration_ms = int((time.time() - start_time) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+
+            choice = data["choices"][0]
+            content = choice["message"]["content"]
+            finish_reason = choice.get("finish_reason")
+            usage = data.get("usage", {})
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+
+            logger.debug(
+                "[d][b]chat.completions response received (non-streaming) | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+                api_request_id,
+            )
+
+            return ChatResult(
+                text=content, usage=usage, finish_reason=finish_reason, raw=data
+            )
+
+        except Exception as e:
+            duration_ms = int((time.time() - start_time) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )

+            logger.exception(
+                "chat.completions request failed (non-streaming) | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            with self.lock:
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    self.token, self.refresh_time = self.get_token()
+            raise


 if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
     provider = ModelProxyProvider(
         model_id="meta-llama/llama-3-3-70b-instruct",
         embedding_model_id="ibm/slate-30m-english-rtrvr",
+        use_legacy_query=False,
+        system_prompt="",
     )
+    # Base class will route .query() to new_query() by default (unless USE_LEGACY_QUERY=true)
     print(provider.query("ok"))
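The new `chat()` method returns a structured `ChatResult` (text, usage, finish_reason, raw) rather than a bare string. One caveat visible in this diff: `self.chat_url` points at `/ml/v1/chat/completions` and the log lines say `chat.completions`, but `new_query()` and `chat()` both actually post to `/ml/v1/text/chat?version=2024-10-08`, and `chat_url` is never referenced in the code shown. A usage sketch, reusing the provider constructed in the `__main__` block above:

```python
result = provider.chat(
    messages=[
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Say ok."},
    ],
    params={"temperature": 0.0, "max_new_tokens": 32},  # mapped to max_tokens
)
print(result.text)           # assistant message content
print(result.finish_reason)  # e.g. "stop"
print(result.usage)          # token accounting reported by the service
```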