scorebook 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
@@ -1,792 +1,100 @@
1
1
  """Progress bar utilities for evaluation tracking."""
2
2
 
3
- import re
4
- import shutil
5
- import threading
6
- import time
7
3
  from contextlib import contextmanager
8
- from dataclasses import dataclass
9
- from itertools import cycle
10
- from typing import Callable, Generator, Optional, cast
4
+ from dataclasses import dataclass, field
5
+ from typing import Generator, Optional
11
6
 
12
7
  from tqdm.auto import tqdm
13
8
 
14
- _IS_NOTEBOOK: Optional[bool] = None
15
-
16
-
17
- def _is_notebook() -> bool:
18
- """Detect if code is running in a Jupyter notebook environment.
19
-
20
- Uses lazy evaluation with caching for efficiency.
21
- """
22
- global _IS_NOTEBOOK
23
- if _IS_NOTEBOOK is None:
24
- try:
25
- shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
26
- _IS_NOTEBOOK = shell == "ZMQInteractiveShell"
27
- except NameError:
28
- _IS_NOTEBOOK = False
29
- return _IS_NOTEBOOK
30
-
31
-
32
- # Color codes - ANSI for terminals, plain text for notebooks
33
- RESET = "\033[0m"
34
-
35
-
36
- def _make_color_func(ansi_code: str) -> Callable[[str], str]:
37
- """Create a color function that checks notebook status at runtime.
38
-
39
- Args:
40
- ansi_code: The ANSI escape code for the color (e.g., "32" for green)
41
-
42
- Returns:
43
- A function that formats text with the color, or returns plain text in notebooks
44
- """
45
-
46
- def color_func(text: str) -> str:
47
- if _is_notebook():
48
- return text
49
- return f"\033[{ansi_code}m{text}\033[0m"
50
-
51
- return color_func
52
-
53
-
54
- # Color functions - automatically handle notebook vs terminal rendering
55
- GREEN = _make_color_func("32") # Green
56
- RED = _make_color_func("31") # Red
57
- LIGHT_GREEN = _make_color_func("92") # Light green
58
- LIGHT_RED = _make_color_func("91") # Light red
59
- BLUE_BASE = _make_color_func("34") # Blue
60
- BLUE_HIGHLIGHT = _make_color_func("1;34") # Bright blue
61
-
62
-
63
- # Shimmer effect width (number of characters highlighted in sweep animation)
64
- # Tested values: 2 (too subtle), 3 (optimal), 5 (too wide)
65
- SHIMMER_WIDTH = 3
66
-
67
- # Spinner blue shimmer colors for terminals (cycled for visual effect)
68
- SPINNER_BLUE_COLORS = [
69
- "\033[34m", # Standard blue
70
- "\033[1;34m", # Bright blue
71
- "\033[94m", # Light blue
72
- "\033[36m", # Cyan
73
- "\033[1;36m", # Bright cyan
74
- "\033[96m", # Light cyan
75
- ]
76
-
77
- # Progress bar configuration
78
- PROGRESS_BAR_FORMAT = "{desc}|{bar}|" # Compact format for progress bars
79
- HEADER_FORMAT = "{desc}" # Header shows only description, no bar
80
-
81
- # Spinner update interval in seconds
82
- # 0.08s = 12.5 Hz provides smooth animation without excessive CPU usage
83
- # Lower values (0.05) cause flickering, higher values (0.2) appear choppy
84
- SPINNER_INTERVAL_SECONDS = 0.08
85
-
86
- # Terminal size fallback if detection fails
87
- # 120 columns: Common wide terminal default
88
- # 20 rows: Not used but required by shutil.get_terminal_size()
89
- TERMINAL_FALLBACK_SIZE = (120, 20)
90
-
91
- # Minimum spacing between header left and right sections
92
- # Prevents sections from touching when terminal is narrow
93
- MINIMUM_HEADER_SPACING = 3
94
-
95
- # Spinner animation frames
96
- SPINNER_FRAMES_UNICODE = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
97
- SPINNER_FRAMES_ASCII = ["|", "/", "-", "\\", "|", "/", "-", "\\"]
98
-
99
-
100
- def _select_spinner_frames() -> list[str]:
101
- """Select appropriate spinner frames based on terminal capabilities."""
102
- import sys
103
-
104
- encoding = sys.stdout.encoding or "ascii"
105
-
106
- if encoding.lower() in ("utf-8", "utf8"):
107
- return SPINNER_FRAMES_UNICODE
108
- else:
109
- return SPINNER_FRAMES_ASCII
110
-
111
-
112
- # Use Braille characters for smooth rotation (fallback to ASCII if needed)
113
- SPINNER_FRAMES = _select_spinner_frames()
114
-
115
- # Progress bar labels
116
- EVALUATIONS_LABEL = "Evaluations" # Label for run-level progress
117
- ITEMS_LABEL = "Items" # Label for item-level progress
118
-
119
- # Compiled regex pattern for ANSI escape codes (used for calculating visual length)
120
- _ANSI_ESCAPE_PATTERN = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
121
-
122
-
123
- def _visual_length(text: str) -> int:
124
- """Calculate the visual length of text, excluding ANSI escape codes."""
125
- return len(_ANSI_ESCAPE_PATTERN.sub("", text))
126
-
127
9
 
128
10
  @dataclass
129
- class EvaluationConfig:
130
- """Configuration for evaluation progress tracking."""
131
-
132
- total_eval_runs: int
133
- total_items: int
134
- dataset_count: int
135
- hyperparam_count: int
136
- model_display: str
137
-
138
- @property
139
- def dataset_label(self) -> str:
140
- """Get the appropriate dataset label (singular/plural)."""
141
- return "Dataset" if self.dataset_count == 1 else "Datasets"
142
-
143
- @property
144
- def hyperparam_label(self) -> str:
145
- """Get the appropriate hyperparameter label (singular/plural)."""
146
- if self.hyperparam_count == 1:
147
- return "Hyperparam Configuration"
148
- return "Hyperparam Configurations"
149
-
150
-
151
- class ProgressBarFormatter:
152
- """Handles formatting for progress bar descriptions and headers.
153
-
154
- This class is responsible for:
155
- - Formatting progress descriptions with aligned counts and percentages
156
- - Building header sections with spinner, timing, and statistics
157
- - Ensuring proper text alignment accounting for ANSI escape codes
158
-
159
- The formatter maintains consistent column widths based on the maximum
160
- number of digits needed for counts, ensuring progress bars don't shift
161
- as numbers increment.
162
- """
163
-
164
- def __init__(self, config: EvaluationConfig) -> None:
165
- """Initialize the formatter with configuration."""
166
- self.config = config
167
- self._label_width = max(len(EVALUATIONS_LABEL), len(ITEMS_LABEL))
168
- self._count_width = max(len(str(config.total_eval_runs)), len(str(config.total_items)), 1)
169
-
170
- def format_progress_description(self, label: str, completed: int, total: int) -> str:
171
- """Format a progress bar description with counts and percentage."""
172
- label_str = label.ljust(self._label_width)
173
- count_str = f"{completed:>{self._count_width}}/{total:>{self._count_width}}"
174
-
175
- if total > 0:
176
- percent = int((completed / total) * 100)
177
- percent_str = f"{percent:>3d}%"
178
- else:
179
- percent_str = " --%"
180
-
181
- return f"{label_str} {count_str} {percent_str} "
182
-
183
- @staticmethod
184
- def format_elapsed_time(elapsed_seconds: float) -> str:
185
- """Format elapsed time as mm:ss or hh:mm:ss."""
186
- total_seconds = int(max(elapsed_seconds, 0))
187
- hours, remainder = divmod(total_seconds, 3600)
188
- minutes, seconds = divmod(remainder, 60)
189
-
190
- if hours:
191
- return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
192
- return f"{minutes:02d}:{seconds:02d}"
193
-
194
- def format_header(
195
- self,
196
- spinner_frame: str,
197
- elapsed_seconds: float,
198
- completed_runs: int,
199
- failed_runs: int,
200
- uploaded_runs: int,
201
- upload_failed_runs: int,
202
- shimmer_text: str = "",
203
- ) -> str:
204
- """Compose the header line with spinner, elapsed time, and run statistics."""
205
- elapsed_str = ProgressBarFormatter.format_elapsed_time(elapsed_seconds)
206
- left_section = self._build_left_section(spinner_frame, elapsed_str, shimmer_text)
207
- right_section = ProgressBarFormatter._build_run_status_section(
208
- completed_runs, failed_runs, uploaded_runs, upload_failed_runs
209
- )
210
-
211
- return ProgressBarFormatter._combine_header_sections(left_section, right_section)
212
-
213
- def _build_left_section(
214
- self, spinner_frame: str, elapsed_str: str, shimmer_text: str = ""
215
- ) -> str:
216
- """Build the left section of the header with spinner and evaluation info."""
217
- # Apply shimmer effect to the model display name
218
- evaluating_text = f"Evaluating {self.config.model_display}"
219
- model_text = shimmer_text if shimmer_text else evaluating_text
220
-
221
- return (
222
- f"{spinner_frame} {model_text} ({elapsed_str}) | "
223
- f"{self.config.dataset_count} {self.config.dataset_label} | "
224
- f"{self.config.hyperparam_count} {self.config.hyperparam_label}"
225
- )
226
-
227
- @staticmethod
228
- def _build_run_status_section(
229
- completed_runs: int, failed_runs: int, uploaded_runs: int, upload_failed_runs: int
230
- ) -> tuple[str, str]:
231
- """Build the run status section with plain and colored versions."""
232
- # Build base run statistics
233
- run_parts = [f"RUNS PASSED: {completed_runs}"]
234
- colored_run_parts = [GREEN(f"RUNS PASSED: {completed_runs}")]
235
-
236
- if failed_runs > 0:
237
- run_parts.append(f"RUNS FAILED: {failed_runs}")
238
- colored_run_parts.append(RED(f"RUNS FAILED: {failed_runs}"))
239
-
240
- # Add upload statistics if any uploads have occurred
241
- if uploaded_runs > 0 or upload_failed_runs > 0:
242
- run_parts.append(f"RUNS UPLOADED: {uploaded_runs}")
243
- colored_run_parts.append(LIGHT_GREEN(f"RUNS UPLOADED: {uploaded_runs}"))
244
-
245
- if upload_failed_runs > 0:
246
- run_parts.append(f"UPLOADS FAILED: {upload_failed_runs}")
247
- colored_run_parts.append(LIGHT_RED(f"UPLOADS FAILED: {upload_failed_runs}"))
248
-
249
- plain = f"[{', '.join(run_parts)}]"
250
- colored = f"[{', '.join(colored_run_parts)}]"
251
-
252
- return plain, colored
253
-
254
- @staticmethod
255
- def _combine_header_sections(left_section: str, right_sections: tuple[str, str]) -> str:
256
- """Combine left and right header sections with appropriate spacing."""
257
- plain_right, colored_right = right_sections
258
-
259
- term_width = shutil.get_terminal_size(fallback=TERMINAL_FALLBACK_SIZE).columns
260
- left_visual_length = _visual_length(left_section)
261
- right_visual_length = len(plain_right)
262
-
263
- # Check for terminal width overflow
264
- total_content_width = left_visual_length + right_visual_length
265
- if total_content_width >= term_width - MINIMUM_HEADER_SPACING:
266
- # Terminal too narrow, truncate left section
267
- max_left_width = term_width - right_visual_length - MINIMUM_HEADER_SPACING - 3
268
- if max_left_width < 20:
269
- # Terminal impossibly narrow, just show right section
270
- return colored_right
271
-
272
- # Truncate left section (strip ANSI codes for simplicity)
273
- left_plain = _ANSI_ESCAPE_PATTERN.sub("", left_section)
274
- left_truncated = left_plain[:max_left_width] + "..."
275
- left_section = left_truncated
276
- left_visual_length = len(left_truncated)
277
-
278
- spacing = term_width - left_visual_length - right_visual_length
279
- spacing = max(spacing, MINIMUM_HEADER_SPACING)
280
-
281
- return f"{left_section}{' ' * spacing}{colored_right}"
282
-
283
-
284
- class SpinnerManager:
285
- """Manages spinner animation for the progress header.
286
-
287
- Features:
288
- - Runs spinner animation in a background daemon thread
289
- - Applies blue color cycling to spinner frames (terminal only)
290
- - Provides shimmer sweep effect for text highlighting
291
- - Thread-safe state management with locks
292
-
293
- The spinner updates at SPINNER_INTERVAL_SECONDS frequency and
294
- automatically stops when stop() is called. In notebook environments,
295
- plain text frames are used without ANSI color codes. The daemon thread
296
- ensures the program can exit cleanly even if the spinner doesn't stop.
297
- """
298
-
299
- def __init__(self) -> None:
300
- """Initialize the spinner manager."""
301
- self._frames = SpinnerManager._normalize_spinner_frames()
302
- self._cycle: Optional[cycle] = None
303
- self._stop_event = threading.Event()
304
- self._thread: Optional[threading.Thread] = None
305
- self.frame_width = len(self._frames[0]) if self._frames else 0
306
- self._shimmer_position = 0 # Position of the shimmer sweep
307
- self._spinner_color_index = 0 # Index for spinner color cycling
308
- self._lock = threading.Lock() # Protects spinner state
309
-
310
- @staticmethod
311
- def _normalize_spinner_frames() -> list[str]:
312
- """Normalize spinner frames to have consistent width."""
313
- if not SPINNER_FRAMES:
314
- return []
315
-
316
- width = max(len(frame) for frame in SPINNER_FRAMES)
317
- return [frame.ljust(width) for frame in SPINNER_FRAMES]
318
-
319
- def start(self, update_callback: Callable[[str], None]) -> None:
320
- """Start the spinner animation."""
321
- if self._thread is not None or not self._frames:
322
- return
323
-
324
- self._stop_event.clear()
325
- self._cycle = cycle(self._frames)
326
- self._thread = threading.Thread(target=self._animate, args=(update_callback,), daemon=True)
327
- self._thread.start()
328
-
329
- def is_running(self) -> bool:
330
- """Check if the spinner animation is currently running."""
331
- return self._thread is not None and self._thread.is_alive()
332
-
333
- def stop(self) -> None:
334
- """Stop the spinner animation."""
335
- if self._thread is None:
336
- return
337
-
338
- self._stop_event.set()
339
- self._thread.join(timeout=5.0)
340
-
341
- if self._thread.is_alive():
342
- import logging
343
-
344
- logger = logging.getLogger(__name__)
345
- logger.warning("Spinner thread did not stop cleanly within 5 seconds")
346
- # Thread is daemon, so it will be killed on exit anyway
347
-
348
- self._thread = None
349
-
350
- def get_initial_frame(self) -> str:
351
- """Get the first spinner frame with blue shimmer effect (terminals only)."""
352
- if not self._frames:
353
- return ""
354
- frame = self._frames[0]
355
-
356
- # Return plain frame for notebooks (no ANSI colors)
357
- if _is_notebook():
358
- return frame
359
-
360
- # Add color codes for terminals
361
- color = SPINNER_BLUE_COLORS[self._spinner_color_index % len(SPINNER_BLUE_COLORS)]
362
- return f"{color}{frame}{RESET}"
363
-
364
- def get_empty_frame(self) -> str:
365
- """Get an empty frame with the same width as spinner frames."""
366
- return " " * self.frame_width
367
-
368
- def get_next_spinner_frame(self) -> str:
369
- """Get the next spinner frame with blue shimmer effect (terminals only)."""
370
- if not self._frames or not self._cycle:
371
- return ""
372
-
373
- frame = cast(str, next(self._cycle))
374
-
375
- # Return plain frame for notebooks (no ANSI colors)
376
- if _is_notebook():
377
- return frame
378
-
379
- # Add color codes for terminals (thread-safe)
380
- with self._lock:
381
- color = SPINNER_BLUE_COLORS[self._spinner_color_index % len(SPINNER_BLUE_COLORS)]
382
- self._spinner_color_index += 1
383
- return f"{color}{frame}{RESET}"
384
-
385
- def get_shimmer_text(self, text: str) -> str:
386
- """Apply sweep shimmer effect to text, returning formatted string."""
387
- if not text:
388
- return text
389
-
390
- # Get current shimmer position (thread-safe)
391
- with self._lock:
392
- shimmer_pos = self._shimmer_position
393
- self._shimmer_position += 1
394
- if self._shimmer_position >= len(text) + SHIMMER_WIDTH:
395
- self._shimmer_position = -SHIMMER_WIDTH
396
-
397
- # Build the text in segments using list (more efficient than string concat)
398
- result_parts = []
399
- i = 0
400
-
401
- while i < len(text):
402
- # Determine if we're in a highlight segment or base segment
403
- if shimmer_pos <= i < shimmer_pos + SHIMMER_WIDTH:
404
- # Start highlight segment
405
- highlight_start = i
406
- while i < len(text) and shimmer_pos <= i < shimmer_pos + SHIMMER_WIDTH:
407
- i += 1
408
- result_parts.append(BLUE_HIGHLIGHT(text[highlight_start:i]))
409
- else:
410
- # Start base segment
411
- base_start = i
412
- while i < len(text) and not (shimmer_pos <= i < shimmer_pos + SHIMMER_WIDTH):
413
- i += 1
414
- result_parts.append(BLUE_BASE(text[base_start:i]))
415
-
416
- return "".join(result_parts)
417
-
418
- def _animate(self, update_callback: Callable[[str], None]) -> None:
419
- """Continuously update the spinner animation."""
420
- import logging
421
-
422
- logger = logging.getLogger(__name__)
423
-
424
- while not self._stop_event.is_set() and self._cycle is not None:
425
- try:
426
- frame = self.get_next_spinner_frame()
427
- update_callback(frame)
428
- time.sleep(SPINNER_INTERVAL_SECONDS)
429
- except Exception as e:
430
- logger.error(
431
- f"Non-critical: Spinner animation thread encountered an error "
432
- f"and will stop. Progress bars will continue without animation. "
433
- f"Details: {e}",
434
- exc_info=True,
435
- )
436
- break # Exit gracefully rather than crash silently
437
-
438
-
439
11
  class EvaluationProgressBars:
440
- """Manages progress bars for evaluation runs and item processing.
441
-
442
- This class coordinates multiple progress displays:
443
- - Terminal mode: header bar + evaluations bar + items bar
444
- - Notebook mode: single simplified evaluations bar
12
+ """Tracks progress for evaluation runs."""
445
13
 
446
- Thread Safety:
447
- All state updates (completed_runs, failed_runs, etc.) are protected
448
- by _state_lock to prevent race conditions with the spinner thread.
14
+ _runs_bar: tqdm
15
+ _items_bar: tqdm
16
+ completed_runs: int = field(default=0, init=False)
17
+ failed_runs: int = field(default=0, init=False)
18
+ uploaded_runs: int = field(default=0, init=False)
19
+ upload_failed_runs: int = field(default=0, init=False)
449
20
 
450
- Lifecycle:
451
- 1. __init__: Initialize with configuration
452
- 2. start_progress_bars: Create and display bars
453
- 3. on_run_completed: Update when runs finish
454
- 4. on_upload_completed: Update when uploads finish
455
- 5. close_progress_bars: Clean up and show summary
456
- """
457
-
458
- def __init__(self, config: EvaluationConfig) -> None:
459
- """Initialize progress bar manager.
21
+ def on_run_completed(self, items_processed: int, succeeded: bool) -> None:
22
+ """Update progress when an evaluation run completes.
460
23
 
461
24
  Args:
462
- config: Configuration for the evaluation progress tracking
25
+ items_processed: Number of items processed in this run.
26
+ Pass 0 for adaptive evals (items tracked via on_item_progress).
27
+ succeeded: Whether the run completed successfully.
463
28
  """
464
- self.config = config
465
- self.formatter = ProgressBarFormatter(config)
466
- self.spinner = SpinnerManager()
467
-
468
- # Progress bar instances
469
- self._header_bar: Optional[tqdm] = None
470
- self._evaluations_bar: Optional[tqdm] = None
471
- self._items_bar: Optional[tqdm] = None
472
-
473
- # State tracking
474
- self.completed_runs = 0
475
- self.failed_runs = 0
476
- self.uploaded_runs = 0
477
- self.upload_failed_runs = 0
478
- self._start_time: Optional[float] = None
479
- self._state_lock = threading.Lock() # Protects run counters
480
-
481
- def start_progress_bars(self) -> None:
482
- """Start the evaluation progress bars."""
483
- self._start_time = time.monotonic()
484
-
485
- try:
486
- self._initialize_progress_bars()
487
- except Exception:
488
- # Ensure spinner is stopped if initialization fails
489
- self.spinner.stop()
490
- raise
491
-
492
- def _initialize_progress_bars(self) -> None:
493
- """Initialize progress bars based on environment."""
494
- if _is_notebook():
495
- # Simplified notebook version - just one progress bar for evaluation runs
496
- spinner_frame = SPINNER_FRAMES[0] if SPINNER_FRAMES else ""
497
- desc = (
498
- f"{spinner_frame} Evaluating {self.config.model_display} | "
499
- f"{self.config.dataset_count} {self.config.dataset_label} | "
500
- f"{self.config.hyperparam_count} {self.config.hyperparam_label}"
501
- )
502
- self._evaluations_bar = tqdm(
503
- total=self.config.total_eval_runs,
504
- desc=desc,
505
- unit="run",
506
- leave=False,
507
- bar_format="{desc} | {n}/{total} Runs {percentage:3.0f}%|{bar}|",
508
- )
509
- # Start spinner animation for notebooks
510
- self.spinner.start(self._update_notebook_spinner)
29
+ self._runs_bar.update(1)
30
+ if items_processed > 0:
31
+ self._items_bar.update(items_processed)
32
+ if succeeded:
33
+ self.completed_runs += 1
511
34
  else:
512
- # Full terminal version with header, spinner, and multiple bars
513
- initial_frame = self.spinner.get_initial_frame()
514
- evaluating_text = f"Evaluating {self.config.model_display}"
515
- initial_shimmer = self.spinner.get_shimmer_text(evaluating_text)
516
- header_desc = self.formatter.format_header(
517
- initial_frame, 0.0, 0, 0, 0, 0, initial_shimmer
518
- )
519
- self._header_bar = tqdm(
520
- total=0,
521
- desc=header_desc,
522
- leave=False,
523
- dynamic_ncols=True,
524
- bar_format=HEADER_FORMAT,
525
- )
526
-
527
- eval_desc = self.formatter.format_progress_description(
528
- EVALUATIONS_LABEL, 0, self.config.total_eval_runs
529
- )
530
- self._evaluations_bar = tqdm(
531
- total=self.config.total_eval_runs,
532
- desc=eval_desc,
533
- unit="run",
534
- leave=False,
535
- dynamic_ncols=True,
536
- bar_format=PROGRESS_BAR_FORMAT,
537
- )
538
-
539
- items_desc = self.formatter.format_progress_description(
540
- ITEMS_LABEL, 0, self.config.total_items
541
- )
542
- self._items_bar = tqdm(
543
- total=self.config.total_items,
544
- desc=items_desc,
545
- unit="item",
546
- leave=False,
547
- dynamic_ncols=True,
548
- bar_format=PROGRESS_BAR_FORMAT,
549
- )
550
-
551
- self._refresh_progress_descriptions()
552
- self.spinner.start(self._update_header_spinner)
553
-
554
- def on_run_completed(self, items_processed: int, succeeded: bool) -> None:
555
- """Update progress when an evaluation run completes."""
556
- with self._state_lock:
557
- if succeeded:
558
- self.completed_runs += 1
559
- else:
560
- self.failed_runs += 1
561
-
562
- if self._evaluations_bar is not None:
563
- self._evaluations_bar.update(1)
35
+ self.failed_runs += 1
564
36
 
565
- if self._items_bar is not None:
566
- self._items_bar.update(items_processed)
37
+ def on_item_progress(self, current: int, total: int) -> None:
38
+ """Update progress for individual items (used by adaptive evaluations).
567
39
 
568
- self._refresh_progress_descriptions()
40
+ Args:
41
+ current: Current item count.
42
+ total: Total item count.
43
+ """
44
+ self._items_bar.n = current
45
+ if total != self._items_bar.total:
46
+ self._items_bar.total = total
47
+ self._items_bar.refresh()
569
48
 
570
49
  def on_upload_completed(self, succeeded: bool) -> None:
571
50
  """Update progress when an upload completes."""
572
- with self._state_lock:
573
- if succeeded:
574
- self.uploaded_runs += 1
575
- else:
576
- self.upload_failed_runs += 1
577
-
578
- # Trigger header refresh in terminal mode
579
- if not _is_notebook() and self._header_bar is not None:
580
- self._refresh_header()
581
-
582
- def close_progress_bars(self) -> None:
583
- """Close all progress bars and cleanup resources."""
584
- self.spinner.stop()
585
- self._finalize_header()
586
-
587
- if self._items_bar is not None:
588
- self._items_bar.close()
589
- self._items_bar = None
590
- if self._evaluations_bar is not None:
591
- self._evaluations_bar.close()
592
- self._evaluations_bar = None
593
- if self._header_bar is not None:
594
- self._header_bar.close()
595
- self._header_bar = None
596
-
597
- self._start_time = None
598
-
599
- # Print summary after clearing progress bars
600
- self._print_summary()
601
-
602
- def _refresh_progress_descriptions(self) -> None:
603
- """Refresh progress bar descriptions to maintain alignment as counts change."""
604
- # Skip refresh in notebooks (spinner handles description updates)
605
- if _is_notebook():
606
- return
607
-
608
- if self._evaluations_bar is not None:
609
- eval_desc = self.formatter.format_progress_description(
610
- EVALUATIONS_LABEL,
611
- min(self._evaluations_bar.n, self.config.total_eval_runs),
612
- self.config.total_eval_runs,
613
- )
614
- self._evaluations_bar.set_description_str(eval_desc, refresh=False)
615
-
616
- if self._items_bar is not None:
617
- items_desc = self.formatter.format_progress_description(
618
- ITEMS_LABEL,
619
- min(self._items_bar.n, self.config.total_items),
620
- self.config.total_items,
621
- )
622
- self._items_bar.set_description_str(items_desc, refresh=False)
623
-
624
- # Refresh both bars
625
- if self._evaluations_bar is not None:
626
- self._evaluations_bar.refresh()
627
- if self._items_bar is not None:
628
- self._items_bar.refresh()
629
-
630
- def _update_notebook_spinner(self, frame: str) -> None:
631
- """Update the notebook progress bar spinner (notebooks only)."""
632
- if self._evaluations_bar is not None:
633
- desc = (
634
- f"{frame} Evaluating {self.config.model_display} | "
635
- f"{self.config.dataset_count} {self.config.dataset_label} | "
636
- f"{self.config.hyperparam_count} {self.config.hyperparam_label}"
637
- )
638
- self._evaluations_bar.set_description_str(desc, refresh=False)
639
- self._evaluations_bar.refresh()
640
-
641
- def _update_header_spinner(self, frame: str) -> None:
642
- """Update the header with a new spinner frame (terminals only)."""
643
- if self._header_bar is not None and self._start_time is not None:
644
- elapsed = time.monotonic() - self._start_time
645
- evaluating_text = f"Evaluating {self.config.model_display}"
646
- shimmer_text = self.spinner.get_shimmer_text(evaluating_text)
647
-
648
- # Read state with lock
649
- with self._state_lock:
650
- completed = self.completed_runs
651
- failed = self.failed_runs
652
- uploaded = self.uploaded_runs
653
- upload_failed = self.upload_failed_runs
654
-
655
- header_desc = self.formatter.format_header(
656
- frame,
657
- elapsed,
658
- completed,
659
- failed,
660
- uploaded,
661
- upload_failed,
662
- shimmer_text,
663
- )
664
- self._header_bar.set_description_str(header_desc, refresh=False)
665
- self._header_bar.refresh()
666
-
667
- def _refresh_header(self) -> None:
668
- """Refresh the header bar with current statistics."""
669
- if self._header_bar is None or self._start_time is None:
670
- return
671
-
672
- elapsed = time.monotonic() - self._start_time
673
-
674
- # Get current spinner frame (or empty if stopped)
675
- if self.spinner.is_running():
676
- # Spinner running, will update via callback soon
677
- return
51
+ if succeeded:
52
+ self.uploaded_runs += 1
678
53
  else:
679
- # Spinner stopped, update manually
680
- frame = self.spinner.get_empty_frame()
681
-
682
- with self._state_lock:
683
- completed = self.completed_runs
684
- failed = self.failed_runs
685
- uploaded = self.uploaded_runs
686
- upload_failed = self.upload_failed_runs
687
-
688
- header_desc = self.formatter.format_header(
689
- frame, elapsed, completed, failed, uploaded, upload_failed, ""
690
- )
691
- self._header_bar.set_description_str(header_desc, refresh=True)
692
-
693
- def _finalize_header(self) -> None:
694
- """Finalize the header line without spinner animation."""
695
- # Only for terminal mode
696
- if _is_notebook():
697
- return
698
-
699
- if self._header_bar is not None and self._start_time is not None:
700
- elapsed = time.monotonic() - self._start_time
701
- final_frame = self.spinner.get_empty_frame()
702
-
703
- # Read state with lock
704
- with self._state_lock:
705
- completed = self.completed_runs
706
- failed = self.failed_runs
707
- uploaded = self.uploaded_runs
708
- upload_failed = self.upload_failed_runs
709
-
710
- # No shimmer for final header
711
- final_desc = self.formatter.format_header(
712
- final_frame, elapsed, completed, failed, uploaded, upload_failed, ""
713
- )
714
- self._header_bar.set_description_str(final_desc, refresh=True)
715
-
716
- def _print_summary(self) -> None:
717
- """Print a clean summary after evaluation completes."""
718
- # Build summary message
719
- summary_parts = [f"Evaluating {self.config.model_display} Completed"]
720
-
721
- # Add run completion info
722
- total_runs = self.completed_runs + self.failed_runs
723
- expected_runs = self.config.total_eval_runs
724
-
725
- # Show if some runs didn't complete (cancelled/interrupted)
726
- if total_runs < expected_runs:
727
- summary_parts.append(
728
- f"{self.completed_runs}/{total_runs} Runs Completed Successfully "
729
- f"(out of {expected_runs} expected)"
730
- )
731
- elif self.failed_runs == 0:
732
- summary_parts.append(f"{self.completed_runs} Runs Completed Successfully")
733
- else:
734
- summary_parts.append(f"{self.completed_runs}/{total_runs} Runs Completed Successfully")
735
-
736
- # Add upload info if any uploads occurred
737
- if self.uploaded_runs > 0 or self.upload_failed_runs > 0:
738
- total_uploads = self.uploaded_runs + self.upload_failed_runs
739
- if self.upload_failed_runs == 0:
740
- summary_parts.append(f"{self.uploaded_runs} Runs Uploaded Successfully")
741
- else:
742
- summary_parts.append(
743
- f"{self.uploaded_runs}/{total_uploads} Runs Uploaded Successfully"
744
- )
745
-
746
- # Join parts with ", " and print
747
- summary = ", ".join(summary_parts)
748
- print(summary)
54
+ self.upload_failed_runs += 1
749
55
 
750
56
 
751
57
  @contextmanager
752
58
  def evaluation_progress_context(
753
59
  total_eval_runs: int,
754
60
  total_items: int,
755
- dataset_count: int,
756
- hyperparam_count: int,
757
61
  model_display: str,
758
62
  enabled: bool = True,
759
63
  ) -> Generator[Optional[EvaluationProgressBars], None, None]:
760
64
  """Context manager for evaluation progress bars.
761
65
 
762
66
  Args:
763
- total_eval_runs: Total number of runs that will be executed
764
- total_items: Total number of evaluation items across all runs
765
- dataset_count: Number of datasets included in the evaluation
766
- hyperparam_count: Number of hyperparameter configurations evaluated
767
- model_display: Human readable model/inference name for the header
768
- enabled: Whether to show progress bars (default: True)
67
+ total_eval_runs: Total number of evaluation runs.
68
+ total_items: Total number of items across all runs.
69
+ model_display: Model name to display in progress description.
70
+ enabled: Whether to show progress bars.
769
71
 
770
72
  Yields:
771
- Optional[EvaluationProgressBars]: Progress bar manager instance (None if disabled)
73
+ EvaluationProgressBars instance, or None if disabled.
772
74
  """
773
75
  if not enabled:
774
76
  yield None
775
77
  return
776
78
 
777
- config = EvaluationConfig(
778
- total_eval_runs=total_eval_runs,
779
- total_items=total_items,
780
- dataset_count=dataset_count,
781
- hyperparam_count=hyperparam_count,
782
- model_display=model_display,
79
+ runs_bar = tqdm(
80
+ total=total_eval_runs,
81
+ desc=f"Evaluating {model_display}",
82
+ unit="run",
83
+ leave=False,
783
84
  )
784
- progress_bars = EvaluationProgressBars(config)
785
- progress_bars.start_progress_bars()
85
+ items_bar = tqdm(
86
+ total=total_items,
87
+ desc="Items",
88
+ unit="item",
89
+ leave=False,
90
+ )
91
+
92
+ progress = EvaluationProgressBars(_runs_bar=runs_bar, _items_bar=items_bar)
786
93
  try:
787
- yield progress_bars
94
+ yield progress
788
95
  finally:
789
- progress_bars.close_progress_bars()
96
+ items_bar.close()
97
+ runs_bar.close()
790
98
 
791
99
 
792
100
  @contextmanager
@@ -797,60 +105,24 @@ def scoring_progress_context(
797
105
  """Context manager for scoring progress display.
798
106
 
799
107
  Args:
800
- total_metrics: Total number of metrics to score
801
- enabled: Whether to show progress bar (default: True)
108
+ total_metrics: Total number of metrics to score.
109
+ enabled: Whether to show progress bar.
802
110
 
803
111
  Yields:
804
- Optional[tqdm]: Progress bar instance (None if disabled)
112
+ tqdm progress bar instance, or None if disabled.
805
113
  """
806
114
  if not enabled:
807
115
  yield None
808
116
  return
809
117
 
810
- # Use appropriate spinner frames based on environment
811
- spinner_frames = SPINNER_FRAMES if SPINNER_FRAMES else ["|"]
812
- spinner_cycle_obj = cycle(spinner_frames)
813
-
814
- # Get initial spinner frame
815
- initial_frame = next(spinner_cycle_obj)
816
-
817
118
  progress_bar = tqdm(
818
119
  total=total_metrics,
819
- desc=f"{initial_frame} Scoring metrics",
120
+ desc="Scoring",
820
121
  unit="metric",
821
122
  leave=False,
822
- bar_format="{desc} | {n}/{total} metrics {percentage:3.0f}%|{bar}|",
823
123
  )
824
124
 
825
- # Start spinner animation thread
826
- stop_event = threading.Event()
827
- current_metric_name = [""] # List to allow mutation in closure
828
-
829
- def animate_spinner() -> None:
830
- """Update spinner and description in background thread."""
831
- while not stop_event.is_set():
832
- try:
833
- frame = next(spinner_cycle_obj)
834
- metric_suffix = f": {current_metric_name[0]}" if current_metric_name[0] else ""
835
- progress_bar.set_description_str(
836
- f"{frame} Scoring metrics{metric_suffix}", refresh=True
837
- )
838
- time.sleep(SPINNER_INTERVAL_SECONDS)
839
- except Exception:
840
- break
841
-
842
- spinner_thread = threading.Thread(target=animate_spinner, daemon=True)
843
- spinner_thread.start()
844
-
845
- # Attach helper method to update current metric name
846
- def set_current_metric(metric_name: str) -> None:
847
- current_metric_name[0] = metric_name
848
-
849
- progress_bar.set_current_metric = set_current_metric
850
-
851
125
  try:
852
126
  yield progress_bar
853
127
  finally:
854
- stop_event.set()
855
- spinner_thread.join(timeout=1.0)
856
128
  progress_bar.close()