scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py CHANGED
@@ -9,12 +9,16 @@ import importlib.metadata
 # get version from pyproject.toml
 __version__ = importlib.metadata.version(__package__ or __name__)
 
-from scorebook.eval_datasets import EvalDataset
-from scorebook.evaluate import evaluate, evaluate_async
+from scorebook.dashboard.create_project import create_project, create_project_async
+from scorebook.dashboard.credentials import login, logout, whoami
+from scorebook.dashboard.upload_results import upload_result, upload_result_async
+from scorebook.eval_datasets.eval_dataset import EvalDataset
+from scorebook.evaluate._async.evaluate_async import evaluate_async
+from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.score import score, score_async
-from scorebook.trismik.credentials import login, logout, whoami
-from scorebook.trismik.upload_results import upload_result, upload_result_async
+from scorebook.metrics.core.metric_registry import scorebook_metric
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
 
 __all__ = [
@@ -28,6 +32,9 @@ __all__ = [
     "logout",
     "whoami",
     "InferencePipeline",
+    "create_project",
+    "create_project_async",
     "upload_result",
     "upload_result_async",
+    "scorebook_metric",
 ]
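With these re-exports, the dashboard helpers, scoring entry points, and the metric-registration hook are all importable from the package root. A minimal sketch of the 0.0.15 top-level surface, limited to names visible in the diff above (how scorebook_metric is used is not shown in this diff):

from scorebook import (
    EvalDataset,       # dataset loading
    evaluate,          # sync evaluation (evaluate_async is also exported)
    score,             # sync scoring (score_async is also exported)
    login,             # store a Trismik API key locally
    create_project,    # new in 0.0.15
    upload_result,
    scorebook_metric,  # new in 0.0.15, exported from the metric registry
)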
scorebook/cli/auth.py CHANGED
@@ -4,7 +4,7 @@ import argparse
 import getpass
 import sys
 
-from scorebook.trismik.credentials import get_stored_token, get_token_path, login, logout, whoami
+from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
 
 
 def auth_command(args: argparse.Namespace) -> int:
scorebook/dashboard/__init__.py ADDED
@@ -0,0 +1 @@
+"""Trismik authentication and API integration."""
scorebook/dashboard/create_project.py ADDED
@@ -0,0 +1,91 @@
+"""Create projects in Trismik's experimentation platform."""
+
+import logging
+from typing import Optional
+
+from trismik.types import TrismikProject
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def create_project(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (synchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Create project via Trismik API
+    project = trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
+
+
+async def create_project_async(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (asynchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Create project via Trismik API (async)
+    project = await trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
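A usage sketch for the new helpers; the project name, description, and team ID below are illustrative values, and a valid Trismik API key is assumed to be configured already (see credentials.py below):

from scorebook import create_project, create_project_async

# Synchronous: create a project in the caller's default team.
project = create_project(name="nightly-evals", description="Nightly regression runs")
print(project.id, project.name)

# Asynchronous variant, e.g. from an async application (team_id is optional).
# project = await create_project_async(name="nightly-evals", team_id="<team-id>")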
scorebook/{trismik → dashboard}/credentials.py RENAMED
@@ -3,8 +3,14 @@
 import logging
 import os
 import pathlib
+import warnings
 from typing import Optional
 
+from dotenv import load_dotenv
+from trismik import TrismikClient
+
+from scorebook.settings import TRISMIK_SERVICE_URL
+
 logger = logging.getLogger(__name__)
 
 
@@ -66,27 +72,66 @@ def get_token() -> Optional[str]:
 
 
 def validate_token(token: str) -> bool:
-    """Validate the token by making a test API call to trismik."""
-    # TODO: Implement actual API validation once you have an endpoint
-    # This would typically make a request to something like:
-    # response = requests.get("https://api.trismik.com/whoami",
-    #                         headers={"Authorization": f"Bearer {token}"})
-    # return response.status_code == 200
+    """Validate the token by making a test API call to trismik.
+
+    Args:
+        token: The API token to validate.
+
+    Returns:
+        bool: True if the token is valid, False otherwise.
+    """
+    if not token or not token.strip():
+        return False
 
-    # For now, just check it's not empty
-    return bool(token and token.strip())
+    try:
+        # Create a client with the token and verify it works
+        client = TrismikClient(service_url=TRISMIK_SERVICE_URL, api_key=token)
+        client.me()
+        client.close()
+        return True
+    except Exception as e:
+        logger.debug(f"Token validation failed: {e}")
+        return False
 
 
-def login(trismik_api_key: str) -> None:
+def login(trismik_api_key: Optional[str] = None) -> None:
     """Login to trismik by saving API key locally.
 
+    If no API key is provided, the function will attempt to read it from the
+    TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+    Environment variables take precedence over .env file values.
+
     Args:
-        trismik_api_key: The API key to use.
+        trismik_api_key: The API key to use. If not provided, reads from
+            environment or .env file.
     Raises:
-        ValueError: If API key is empty or invalid.
+        ValueError: If API key is empty, not found, or invalid.
+
+    Warns:
+        UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+            environment variable is also set.
     """
+    # Warn if user passes explicit key but env var is also set
+    if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+        warnings.warn(
+            "TRISMIK_API_KEY environment variable is set. The environment variable "
+            "takes precedence over the stored token when calling evaluate(). "
+            "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+            "environment variable.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if trismik_api_key is None:
+        # Load from .env file if TRISMIK_API_KEY is not already set in environment
+        load_dotenv()
+        trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
     if not trismik_api_key:
-        raise ValueError("API key cannot be empty")
+        raise ValueError(
+            "API key cannot be empty. Either pass it as a parameter or "
+            "set the TRISMIK_API_KEY environment variable or .env file."
+        )
 
     # Validate token
     if not validate_token(trismik_api_key):
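A minimal sketch of the new login behaviour (the key shown is a placeholder, not a real token):

from scorebook import login

# Explicit key: a UserWarning is raised if TRISMIK_API_KEY is also set, since the
# environment variable takes precedence over the stored token when evaluate() runs.
login("YOUR_TRISMIK_API_KEY")

# No argument: login() calls load_dotenv(), reads TRISMIK_API_KEY from the environment
# or a .env file, and raises ValueError if no valid key is found.
login()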
scorebook/{trismik → dashboard}/upload_results.py RENAMED
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 # Known fields that are not metrics or hyperparameters
 KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
-KNOWN_ITEM_FIELDS = {"id", "dataset_name", "input", "output", "label", "run_id"}
+KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}
 
 
 def upload_result(
scorebook/eval_datasets/__init__.py CHANGED
@@ -1,5 +1 @@
 """Dataset utilities for scorebook."""
-
-from scorebook.eval_datasets.eval_dataset import EvalDataset
-
-__all__ = ["EvalDataset"]
scorebook/eval_datasets/eval_dataset.py CHANGED
@@ -18,8 +18,10 @@ from scorebook.exceptions import (
     DatasetSampleError,
     MissingFieldError,
 )
-from scorebook.metrics import MetricBase, MetricRegistry
-from scorebook.utils import render_template, validate_path
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
+from scorebook.utils.io_helpers import validate_path
+from scorebook.utils.render_template import render_template
 
 
 class EvalDataset:
@@ -1,15 +1 @@
1
- """
2
- Evaluation module for Scorebook.
3
-
4
- This module provides both synchronous and asynchronous evaluation functions.
5
- The async version serves as the source of truth, with the sync version
6
- automatically generated using unasync.
7
- """
8
-
9
- # Import from async module
10
- from ._async.evaluate_async import evaluate_async
11
-
12
- # Import from generated sync module
13
- from ._sync.evaluate import evaluate
14
-
15
- __all__ = ["evaluate", "evaluate_async"]
1
+ """Evaluation module for Scorebook."""
scorebook/evaluate/_async/evaluate_async.py CHANGED
@@ -6,7 +6,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -28,12 +28,9 @@ from scorebook.types (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import (
-    async_nullcontext,
-    evaluation_progress_context,
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from scorebook.utils.async_utils import async_nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -116,8 +113,6 @@ async def evaluate_async(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -154,19 +149,31 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )
 
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals have .items; adaptive evals use max_iterations
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         # Update upload progress for classic evals
@@ -198,11 +205,12 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,  # NEW PARAMETER
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""
 
@@ -221,6 +229,7 @@ async def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )
 
     else:
@@ -341,6 +350,7 @@ async def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -350,7 +360,7 @@
         raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
     adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
@@ -368,6 +378,7 @@ async def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.
 
@@ -378,6 +389,7 @@
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -392,6 +404,10 @@
         available_splits=available_splits,
     )
 
+    # Create inference function with bound hyperparameters
+    async def inference_with_hyperparams(items: Any) -> Any:
+        return await inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
         split=resolved_split,
@@ -402,7 +418,8 @@
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(inference),
+        item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
 
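Two behavioural changes run through this file: adaptive runs now report item-level progress via an on_progress(current, total) callback instead of assuming max_iterations items, and the run's hyperparameter configuration is bound onto the inference callable before it reaches the Trismik item processor. In practice, a user-supplied inference function should therefore accept sweep parameters as keyword arguments; a hedged sketch, with call_model and the temperature parameter as illustrative stand-ins rather than scorebook API:

from typing import Any, List

async def call_model(item: Any, temperature: float) -> str:
    # Placeholder for a real model call.
    return f"answer at temperature {temperature}"

async def inference(items: Any, temperature: float = 0.7, **hyperparameters: Any) -> List[str]:
    # Adaptive runs now invoke inference(items, **hyperparameter_config), so any
    # swept parameter (here: temperature) must be accepted as a keyword argument.
    return [await call_model(item, temperature=temperature) for item in items]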
scorebook/evaluate/_sync/evaluate.py CHANGED
@@ -5,7 +5,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -27,12 +27,9 @@ from scorebook.types (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import (
-    nullcontext,
-    evaluation_progress_context,
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from contextlib import nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -115,8 +112,6 @@ def evaluate(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -153,19 +148,31 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )
 
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals have .items; adaptive evals use max_iterations
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         # Update upload progress for classic evals
@@ -197,11 +204,12 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,  # NEW PARAMETER
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""
 
@@ -220,6 +228,7 @@ def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )
 
     else:
@@ -340,6 +349,7 @@ def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -349,7 +359,7 @@
         raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
     adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
@@ -367,6 +377,7 @@ def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.
 
@@ -377,6 +388,7 @@
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -391,6 +403,10 @@
         available_splits=available_splits,
     )
 
+    # Create inference function with bound hyperparameters
+    def inference_with_hyperparams(items: Any) -> Any:
+        return inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
         split=resolved_split,
@@ -401,7 +417,8 @@
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(inference),
+        item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
 
scorebook/evaluate/evaluate_helpers.py CHANGED
@@ -9,7 +9,8 @@ from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
 from trismik.types import TrismikMultipleChoiceTextItem
 
-from scorebook import EvalDataset
+from scorebook.dashboard.credentials import get_token
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.exceptions import (
     DataMismatchError,
     MetricComputationError,
@@ -17,9 +18,9 @@ from scorebook.exceptions (
     ScoreBookError,
 )
 from scorebook.settings import TRISMIK_SERVICE_URL
-from scorebook.trismik.credentials import get_token
 from scorebook.types import AdaptiveEvalDataset, AdaptiveEvalRunSpec, EvalResult, EvalRunSpec
-from scorebook.utils import expand_dict, is_awaitable
+from scorebook.utils.async_utils import is_awaitable
+from scorebook.utils.transform_helpers import expand_dict
 
 logger = logging.getLogger(__name__)
 
scorebook/inference/__init__.py CHANGED
@@ -1,11 +1 @@
-"""
-Inference module for model execution and predictions.
-
-This module provides functionality for running inference with various models
-and processing their responses. It includes utilities for both single and
-batch inference operations.
-"""
-
-from scorebook.inference.inference_pipeline import InferencePipeline
-
-__all__ = ["InferencePipeline"]
+"""Inference module for model execution and predictions."""
scorebook/inference/clients/__init__.py CHANGED
@@ -1,8 +1 @@
-"""
-Inference clients for various LLM providers.
-
-This module provides client implementations for different LLM providers including
-OpenAI, AWS Bedrock, Google Vertex AI, and Portkey.
-"""
-
-__all__ = ["bedrock", "openai", "portkey", "vertex"]
+"""Inference clients for various LLM providers."""
scorebook/inference/inference_pipeline.py CHANGED
@@ -9,7 +9,7 @@ configurable way.
 import asyncio
 from typing import Any, Callable, Dict, List, Optional, cast
 
-from scorebook.utils import is_awaitable
+from scorebook.utils.async_utils import is_awaitable
 
 
 class InferencePipeline: