opik-optimizer 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,679 @@
1
+ from contextlib import contextmanager
2
+ from typing import Any
3
+ from collections.abc import Iterator
4
+
5
+ from rich.panel import Panel
6
+ from rich.text import Text
7
+
8
+ from ..optimization_config import chat_prompt
9
+ from ..reporting_utils import (
10
+ convert_tqdm_to_rich,
11
+ display_configuration, # noqa: F401
12
+ display_header, # noqa: F401
13
+ display_messages,
14
+ display_result, # noqa: F401
15
+ get_console,
16
+ suppress_opik_logs,
17
+ )
18
+
19
+ PANEL_WIDTH = 90
20
+ console = get_console()
21
+
22
+
23
def display_retry_attempt(
    attempt: int,
    max_attempts: int,
    failure_mode_name: str,
    verbose: int = 1,
) -> None:
    """Print a yellow status line announcing a retry for a failure mode.

    Args:
        attempt: 0-based retry index (displayed 1-based).
        max_attempts: Total number of retries allowed.
        failure_mode_name: Name of the failure mode being retried.
        verbose: 0 suppresses all output.
    """
    if verbose < 1:
        return
    message = (
        f"│ Retry attempt {attempt + 1}/{max_attempts} "
        f"for failure mode '{failure_mode_name}' (no improvement observed)"
    )
    console.print(Text(message, style="yellow"))
37
+
38
+
39
@contextmanager
def display_round_progress(max_rounds: int, verbose: int = 1) -> Any:
    """Context manager that yields a reporter for per-round progress messages.

    Args:
        max_rounds: Total number of optimization rounds (shown in messages).
        verbose: 0 suppresses all output; >= 1 prints progress lines.

    Yields:
        A Reporter with ``failed_to_generate``, ``round_start`` and
        ``round_end`` methods announcing round lifecycle events.
    """

    # Lightweight reporter; every method is a no-op when verbose < 1.
    class Reporter:
        def failed_to_generate(self, num_prompts: int, error: str) -> None:
            """Report that candidate prompt generation failed with *error*."""
            if verbose >= 1:
                console.print(
                    Text(
                        f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}",
                        style="red",
                    )
                )
                console.print(Text("│"))

        def round_start(self, round_number: int) -> None:
            """Announce the start of a (0-based) optimization round."""
            if verbose >= 1:
                console.print(
                    Text(
                        f"│ - Starting optimization round {round_number + 1} of {max_rounds}"
                    )
                )

        def round_end(self, round_number: int, score: float, best_score: float) -> None:
            """Announce round completion and how *score* compares to *best_score*."""
            if verbose >= 1:
                console.print(
                    Text(
                        f"│ Completed optimization round {round_number + 1} of {max_rounds}"
                    )
                )
                # best_score == 0 is treated as "no best recorded yet":
                # any positive score then counts as a new best.
                if best_score == 0 and score == 0:
                    console.print(
                        Text(
                            "│ No improvement in this optimization round - score is 0",
                            style="yellow",
                        )
                    )
                elif best_score == 0:
                    console.print(
                        Text(
                            f"│ Found a new best performing prompt: {score:.4f}",
                            style="green",
                        )
                    )
                elif score > best_score:
                    # Relative improvement over the previous best.
                    perc_change = (score - best_score) / best_score
                    console.print(
                        Text(
                            f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})",
                            style="green",
                        )
                    )
                elif score <= best_score:
                    console.print(
                        Text(
                            "│ No improvement in this optimization round",
                            style="red",
                        )
                    )

                console.print(Text("│"))

    # Use our log suppression context manager and yield the reporter
    with suppress_opik_logs():
        with convert_tqdm_to_rich(verbose=verbose):
            try:
                yield Reporter()
            finally:
                pass
109
+
110
+
111
@contextmanager
def display_evaluation(
    message: str = "First we will establish the baseline performance:",
    verbose: int = 1,
    indent: str = "> ",
    baseline_score: float | None = None,
) -> Any:
    """Context manager to display messages during an evaluation phase.

    Args:
        message: Message to display
        verbose: Verbosity level (0 suppresses all output)
        indent: Prefix for the message (default "> " for top-level, "│ " for nested)
        baseline_score: If provided, shows score comparison instead of "Baseline score"

    Yields:
        A Reporter whose ``set_score(s)`` prints the evaluation result.
    """
    # Entry point
    if verbose >= 1:
        console.print(Text(f"{indent}{message}"))

    # Create a simple object with a method to set the score
    class Reporter:
        def set_score(self, s: float) -> None:
            """Print *s* as the baseline, or compared against baseline_score."""
            if verbose >= 1:
                # Adjust score indentation based on indent style
                score_indent = " " if indent == "> " else "│ "

                if baseline_score is None:
                    # This is the baseline evaluation
                    console.print(
                        Text(
                            f"\r{score_indent}Baseline score was: {s:.4f}.",
                            style="green",
                        )
                    )
                    console.print(Text("│"))
                else:
                    # This is an improved prompt evaluation - show comparison
                    if s > baseline_score:
                        # Percentage deltas guard against division by a zero baseline.
                        improvement_pct = (
                            ((s - baseline_score) / baseline_score * 100)
                            if baseline_score > 0
                            else 0
                        )
                        console.print(
                            Text(
                                f"\r{score_indent}Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
                                style="green bold",
                            )
                        )
                    elif s < baseline_score:
                        decline_pct = (
                            ((baseline_score - s) / baseline_score * 100)
                            if baseline_score > 0
                            else 0
                        )
                        console.print(
                            Text(
                                f"\r{score_indent}Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
                                style="red",
                            )
                        )
                    else:
                        console.print(
                            Text(
                                f"\r{score_indent}Score for updated prompt: {s:.4f} (no change)",
                                style="yellow",
                            )
                        )
                    console.print(Text("│"))

    # Use our log suppression context manager and yield the reporter
    # Adjust progress bar indentation based on indent style
    progress_indent = " Evaluation" if indent == "> " else "│ Evaluation"
    with suppress_opik_logs():
        with convert_tqdm_to_rich(progress_indent, verbose=verbose):
            try:
                yield Reporter()
            finally:
                pass
190
+
191
+
192
def display_optimization_start_message(verbose: int = 1) -> None:
    """Announce the start of the optimization run (silent when verbose < 1)."""
    if verbose < 1:
        return
    console.print(Text("> Starting the optimization run"))
    console.print(Text("│"))
196
+
197
+
198
class CandidateGenerationReporter:
    """Reports the outcome of candidate-prompt generation.

    Fix: the success message now honours a ``verbose`` level, consistent
    with every other reporter in this module; previously it always printed
    regardless of verbosity. The new parameter defaults to 1 so existing
    callers are unaffected.
    """

    def __init__(self, num_prompts: int, verbose: int = 1):
        self.num_prompts = num_prompts  # number of candidate prompts generated
        self.verbose = verbose  # 0 silences output, >= 1 prints

    def set_generated_prompts(self) -> None:
        """Print a dim confirmation line followed by a '│' spacer line."""
        if self.verbose < 1:
            return
        console.print(
            Text(
                f"│ Successfully generated {self.num_prompts} candidate prompt{'' if self.num_prompts == 1 else 's'}",
                style="dim",
            )
        )
        console.print(Text("│"))
210
+
211
+
212
def display_tool_description(description: str, label: str, color: str) -> None:
    """Render a tool description inside a titled, colored panel.

    Nothing is printed when the description is empty or whitespace-only.
    """
    text = description.strip()
    if not text:
        return
    console.print(Panel(text, title=label, border_style=color))
222
+
223
+
224
@contextmanager
def display_candidate_generation_report(
    num_prompts: int, verbose: int = 1
) -> Iterator[CandidateGenerationReporter]:
    """Yield a reporter for the candidate-generation phase, printing a header first.

    Args:
        num_prompts: Expected number of candidate prompts.
        verbose: 0 suppresses the header line.
    """
    if verbose >= 1:
        suffix = "" if num_prompts == 1 else "s"
        console.print(Text(f"│ Generating candidate prompt{suffix}:"))

    yield CandidateGenerationReporter(num_prompts)
237
+
238
+
239
@contextmanager
def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
    """Context manager yielding a reporter for candidate-prompt scoring.

    The reporter prints each candidate's messages and its final score,
    colour-coded by comparison with the best score so far. While the context
    is active, Opik logs are suppressed and tqdm output is rendered as a
    rich progress bar labelled "│ Evaluation".
    """

    # Create a simple object with a method to set the score
    class Reporter:
        def set_generated_prompts(
            self, candidate_count: int, prompt: chat_prompt.ChatPrompt
        ) -> None:
            """Print the candidate's (0-based) index and its chat messages."""
            if verbose >= 1:
                console.print(
                    Text(f"│ Evaluating candidate prompt {candidate_count + 1}:")
                )
                display_messages(prompt.get_messages(), "│ ")

        def set_final_score(self, best_score: float, score: float) -> None:
            """Print *score*, styled by how it compares to *best_score*."""
            if verbose >= 1:
                # best_score == 0 means no best recorded yet, so a positive
                # score is shown green without a percentage delta.
                if best_score == 0 and score > 0:
                    console.print(
                        Text(
                            f"│ Evaluation score: {score:.4f}",
                            style="green",
                        )
                    )
                elif best_score == 0 and score == 0:
                    console.print(
                        Text(
                            f"│ Evaluation score: {score:.4f}",
                            style="dim yellow",
                        )
                    )
                elif score > best_score:
                    # Relative change versus the previous best (positive here).
                    perc_change = (score - best_score) / best_score
                    console.print(
                        Text(
                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
                            style="green",
                        )
                    )
                elif score < best_score:
                    # Negative relative change, rendered in red.
                    perc_change = (score - best_score) / best_score
                    console.print(
                        Text(
                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
                            style="red",
                        )
                    )
                else:
                    console.print(
                        Text(
                            f"│ Evaluation score: {score:.4f}",
                            style="dim yellow",
                        )
                    )

                console.print(Text("│ "))
                console.print(Text("│ "))

    try:
        with suppress_opik_logs():
            with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
                yield Reporter()
    finally:
        pass
303
+
304
+
305
@contextmanager
def display_optimization_iteration(iteration: int, verbose: int = 1) -> Iterator[Any]:
    """Context manager that frames a single optimization iteration.

    Prints an iteration header on entry and yields a reporter whose
    ``iteration_complete`` announces whether the iteration improved the
    best score.
    """
    if verbose >= 1:
        console.print()
        console.print(Text("│"))
        console.print(Text(f"│ Iteration {iteration}", style="bold cyan"))

    class Reporter:
        def iteration_complete(self, best_score: float, improved: bool) -> None:
            """Print the iteration outcome (green on improvement, yellow otherwise)."""
            if verbose < 1:
                return
            if improved:
                summary = f"│ Iteration {iteration} complete - New best score: {best_score:.4f}"
                style = "green"
            else:
                summary = f"│ Iteration {iteration} complete - No improvement (best: {best_score:.4f})"
                style = "yellow"
            console.print(Text(summary, style=style))
            console.print(Text("│"))

    yield Reporter()
336
+
337
+
338
@contextmanager
def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
    """Context manager showing progress while failed items are analyzed in batches.

    Yields a reporter whose ``set_completed`` prints a green summary once
    all batches have been processed. While active, Opik logs are suppressed
    and tqdm output is rendered as a "│ Batch analysis" progress bar.
    """
    if verbose >= 1:
        console.print(Text("│ "))
        console.print(
            Text("│ Analyzing root cause of failed evaluation items", style="cyan")
        )

    class Reporter:
        def set_completed(self, total_test_cases: int, num_batches: int) -> None:
            """Print how many test cases were analyzed across how many batches."""
            if verbose < 1:
                return
            console.print(
                Text(
                    f"│ Analyzed {total_test_cases} test cases across {num_batches} batches",
                    style="green",
                )
            )
            console.print(Text("│ "))

    with suppress_opik_logs():
        with convert_tqdm_to_rich("│ Batch analysis", verbose=verbose):
            yield Reporter()
364
+
365
+
366
@contextmanager
def display_batch_synthesis(num_batches: int, verbose: int = 1) -> Iterator[Any]:
    """Context manager shown while per-batch failure analyses are synthesized.

    Args:
        num_batches: Number of batches being synthesized (currently unused
            in the output, kept for interface symmetry).
        verbose: 0 suppresses the header line.
    """
    if verbose >= 1:
        console.print(Text("│ Synthesizing failure modes", style="cyan"))

    class Reporter:
        def set_completed(self, num_unified_modes: int) -> None:
            # Intentionally silent: the failure modes themselves are printed next.
            pass

    with suppress_opik_logs():
        yield Reporter()
379
+
380
+
381
def display_hierarchical_synthesis(
    total_test_cases: int, num_batches: int, synthesis_notes: str, verbose: int = 1
) -> None:
    """Render the hierarchical root-cause synthesis inside a cyan panel.

    Each rendered panel line is re-emitted behind the "│ " gutter used
    throughout this module's output.
    """
    if verbose < 1:
        return

    body = Text()
    body.append(
        f"Analyzed {total_test_cases} test cases across {num_batches} batches\n\n",
        style="bold",
    )
    body.append("Synthesis Notes:\n", style="cyan")
    body.append(synthesis_notes)

    panel = Panel(
        body,
        title="🔍 Hierarchical Root Cause Analysis",
        title_align="left",
        border_style="cyan",
        width=PANEL_WIDTH,
    )

    # Render the panel to ANSI text so every line can be prefixed.
    with console.capture() as capture:
        console.print(panel)
    for rendered_line in capture.get().splitlines():
        console.print(Text("│ ") + Text.from_ansi(rendered_line))

    console.print()
413
+
414
+
415
def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
    """Render identified failure modes as '│ '-prefixed panels.

    A yellow header panel with the count is printed first, then one panel
    per failure mode (the first mode's border is red to mark it as the most
    significant). Each mode object is expected to expose ``name``,
    ``description`` and ``root_cause`` attributes.
    """
    if verbose < 1:
        return

    def _print_prefixed(panel: Panel) -> None:
        # Capture the ANSI rendering and re-emit each line behind the gutter.
        with console.capture() as capture:
            console.print(panel)
        for rendered_line in capture.get().splitlines():
            console.print(Text("│ ") + Text.from_ansi(rendered_line))

    count = len(failure_modes)
    header = Panel(
        Text(
            f"Found {count} distinct failure pattern{'s' if count != 1 else ''}",
            style="bold yellow",
        ),
        title="⚠️ IDENTIFIED FAILURE MODES",
        title_align="left",
        border_style="yellow",
        width=PANEL_WIDTH,
    )
    _print_prefixed(header)
    console.print()

    for idx, failure_mode in enumerate(failure_modes, 1):
        details = Text()
        details.append(f"{failure_mode.name}\n\n", style="bold white")
        details.append("Description:\n", style="cyan")
        details.append(f"{failure_mode.description}\n\n")
        details.append("Root Cause:\n", style="cyan")
        details.append(f"{failure_mode.root_cause}")

        _print_prefixed(
            Panel(
                details,
                title=f"Failure Mode {idx}",
                title_align="left",
                border_style="red" if idx == 1 else "yellow",
                width=PANEL_WIDTH,
            )
        )

        # Separator between modes, but not after the last one.
        if idx < count:
            console.print("│")
468
+
469
+
470
@contextmanager
def display_prompt_improvement(
    failure_mode_name: str, verbose: int = 1
) -> Iterator[Any]:
    """Context manager to display progress while generating improved prompt.

    Args:
        failure_mode_name: Name of the failure mode being addressed.
        verbose: 0 suppresses all output; >= 1 prints progress.

    Yields:
        A Reporter whose ``set_reasoning(reasoning)`` renders the improvement
        strategy inside a blue panel, each line prefixed with "│ ".
    """
    if verbose >= 1:
        console.print()
        console.print(Text("│ "))
        console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))

    class Reporter:
        def set_reasoning(self, reasoning: str) -> None:
            """Display the improvement strategy for this failure mode."""
            if verbose >= 1:
                reasoning_content = Text()
                reasoning_content.append("Improvement Strategy:\n", style="cyan")
                reasoning_content.append(reasoning)

                panel = Panel(
                    reasoning_content,
                    title="💡 Reasoning",
                    title_align="left",
                    border_style="blue",
                    width=PANEL_WIDTH - 10,
                    padding=(0, 1),
                )

                # Capture and prefix each line
                with console.capture() as capture:
                    console.print(panel)

                rendered_panel = capture.get()
                for line in rendered_panel.splitlines():
                    console.print(Text("│ ") + Text.from_ansi(line))

                console.print(Text("│ "))

    try:
        with suppress_opik_logs():
            with convert_tqdm_to_rich(
                "│ Generating improved prompt", verbose=verbose
            ):
                yield Reporter()
    finally:
        pass
514
+
515
+
516
def display_improvement_reasoning(
    failure_mode_name: str, reasoning: str, verbose: int = 1
) -> None:
    """Print the improvement strategy for one failure mode inside a blue panel."""
    if verbose < 1:
        return

    console.print()
    console.print(Text("│ "))
    console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))

    strategy = Text()
    strategy.append("Improvement Strategy:\n", style="cyan")
    strategy.append(reasoning)

    panel = Panel(
        strategy,
        title="💡 Reasoning",
        title_align="left",
        border_style="blue",
        width=PANEL_WIDTH - 10,
        padding=(0, 1),
    )

    # Re-emit the rendered panel line by line behind the '│ ' gutter.
    with console.capture() as capture:
        console.print(panel)
    for rendered_line in capture.get().splitlines():
        console.print(Text("│ ") + Text.from_ansi(rendered_line))

    console.print(Text("│ "))
549
+
550
+
551
def display_iteration_improvement(
    improvement: float, current_score: float, best_score: float, verbose: int = 1
) -> None:
    """Print whether this iteration improved on the best score so far.

    Args:
        improvement: Relative improvement (positive means a better score).
        current_score: Score achieved in this iteration.
        best_score: Best score prior to this iteration.
        verbose: 0 suppresses all output.
    """
    if verbose < 1:
        return

    if improvement > 0:
        line = f"│ ✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})"
        style = "green bold"
    else:
        line = f"│ ✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})"
        style = "yellow"
    console.print(Text(line, style=style))
572
+
573
+
574
def display_optimized_prompt_diff(
    initial_messages: list[dict[str, str]],
    optimized_messages: list[dict[str, str]],
    initial_score: float,
    best_score: float,
    verbose: int = 1,
) -> None:
    """Display a git-style diff of prompt changes.

    Args:
        initial_messages: Original chat messages (dicts with "role"/"content").
        optimized_messages: Optimized chat messages in the same format.
        initial_score: Baseline score before optimization.
        best_score: Best score achieved by the optimized prompt.
        verbose: 0 suppresses all output.

    Bug fix: the per-message diff previously sliced off the first three
    lines of ``difflib.unified_diff`` output, which discarded the first
    ``@@`` hunk header even though the render loop has a dedicated ``@@``
    styling branch (so later hunks showed their headers but the first did
    not). Now only the two file headers (``---``/``+++``) are skipped, so
    every hunk header is rendered consistently.
    """
    import difflib

    if verbose < 1:
        return

    console.print()
    console.print(Text("│"))
    console.print(Text("│ > Optimization Results", style="bold green"))
    console.print(Text("│"))

    # Show score improvement relative to the initial score.
    if best_score > initial_score:
        perc_change = (best_score - initial_score) / initial_score
        console.print(
            Text(
                f"│ Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
                style="green",
            )
        )
    else:
        console.print(
            Text(f"│ No improvement found (score: {best_score:.4f})", style="yellow")
        )

    console.print(Text("│"))
    console.print(Text("│ Prompt Changes:", style="cyan"))
    console.print(Text("│"))

    # Walk both message lists in parallel; extra entries on either side are
    # reported as whole-message additions/removals.
    for idx in range(max(len(initial_messages), len(optimized_messages))):
        initial_msg = initial_messages[idx] if idx < len(initial_messages) else None
        optimized_msg = (
            optimized_messages[idx] if idx < len(optimized_messages) else None
        )

        # Get role from whichever message exists.
        role = "message"
        if initial_msg:
            role = initial_msg.get("role", "message")
        elif optimized_msg:
            role = optimized_msg.get("role", "message")

        initial_content = initial_msg.get("content", "") if initial_msg else ""
        optimized_content = optimized_msg.get("content", "") if optimized_msg else ""

        # Handle added messages.
        if not initial_msg:
            console.print(Text(f"│ {role}: (added)", style="green bold"))
            for line in optimized_content.splitlines():
                console.print(Text(f"│ +{line}", style="green"))
            console.print(Text("│"))
            continue

        # Handle removed messages.
        if not optimized_msg:
            console.print(Text(f"│ {role}: (removed)", style="red bold"))
            for line in initial_content.splitlines():
                console.print(Text(f"│ -{line}", style="red"))
            console.print(Text("│"))
            continue

        # No changes in this message.
        if initial_content == optimized_content:
            console.print(Text(f"│ {role}: (unchanged)", style="dim"))
            continue

        # Generate unified diff.
        diff_lines = list(
            difflib.unified_diff(
                initial_content.splitlines(keepends=False),
                optimized_content.splitlines(keepends=False),
                lineterm="",
                n=3,  # 3 lines of context
            )
        )

        if not diff_lines:
            continue

        # Display message header.
        console.print(Text(f"│ {role}:", style="bold cyan"))

        # Colour-code the diff body; all hunk headers hit the "@@" branch.
        diff_content = Text()
        for line in diff_lines[2:]:  # Skip only the ---/+++ file headers
            if line.startswith("+"):
                diff_content.append("│ " + line + "\n", style="green")
            elif line.startswith("-"):
                diff_content.append("│ " + line + "\n", style="red")
            elif line.startswith("@@"):
                diff_content.append("│ " + line + "\n", style="cyan dim")
            else:
                # Context line.
                diff_content.append("│ " + line + "\n", style="dim")

        console.print(diff_content)
        console.print(Text("│"))
679
+ console.print(Text("│"))