scorebook-0.0.9-py3-none-any.whl → scorebook-0.0.10-py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (31)
  1. scorebook/__init__.py +12 -4
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/evaluate/__init__.py +15 -0
  4. scorebook/evaluate/_async/__init__.py +0 -0
  5. scorebook/evaluate/_async/evaluate_async.py +413 -0
  6. scorebook/evaluate/_sync/__init__.py +0 -0
  7. scorebook/evaluate/_sync/evaluate.py +413 -0
  8. scorebook/evaluate/evaluate_helpers.py +365 -0
  9. scorebook/inference/__init__.py +4 -0
  10. scorebook/inference/clients/__init__.py +8 -0
  11. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  12. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  13. scorebook/settings.py +18 -0
  14. scorebook/trismik/__init__.py +10 -0
  15. scorebook/utils/__init__.py +9 -2
  16. scorebook/utils/async_utils.py +20 -1
  17. scorebook/utils/progress_bars.py +22 -61
  18. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
  19. scorebook-0.0.10.dist-info/RECORD +41 -0
  20. scorebook/evaluate.py +0 -623
  21. scorebook/trismik_services/__init__.py +0 -6
  22. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  23. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  24. scorebook-0.0.9.dist-info/RECORD +0 -36
  25. scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
  26. scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
  27. scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
  28. scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  29. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
  30. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
  31. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
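
Taken together, the list shows a restructuring rather than new surface area: the monolithic scorebook/evaluate.py (623 lines removed) is split into an evaluate package with separate sync and async implementations plus shared helpers, the per-provider inference modules move under scorebook/inference/clients/, and the trismik_services package is renamed to scorebook/trismik (with login.py becoming credentials.py). Code that imports the moved modules by their old paths may break on upgrade. A minimal sketch of a version-tolerant import, assuming the moved modules are importable by path and not re-exported under a stable name (the openai_client alias is hypothetical):

    # Hypothetical compatibility shim based on the rename entries above;
    # check scorebook's own re-exports before relying on deep import paths.
    try:
        from scorebook.inference.clients import openai as openai_client  # 0.0.10+
    except ImportError:
        from scorebook.inference import openai as openai_client  # 0.0.9 and earlier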
scorebook/trismik_services/adaptive_testing_service.py
@@ -1,141 +0,0 @@
- """Trismik adaptive testing service integration."""
-
- import asyncio
- import dataclasses
- import inspect
- import logging
- from typing import Any, Callable, Iterable, Mapping
-
- from trismik.adaptive_test import AdaptiveTest
- from trismik.client_async import TrismikAsyncClient
- from trismik.types import TrismikMultipleChoiceTextItem, TrismikRunMetadata
-
- from scorebook.types import AdaptiveEvalRunResult, AdaptiveEvalRunSpec
-
- from .login import get_token
-
- logger = logging.getLogger(__name__)
-
-
- async def run_adaptive_evaluation(
-     inference: Callable,
-     adaptive_run_spec: AdaptiveEvalRunSpec,
-     experiment_id: str,
-     project_id: str,
-     metadata: Any,
- ) -> AdaptiveEvalRunResult:
-     """Run an adaptive evaluation using the Trismik API.
-
-     Args:
-         inference: Function to run inference
-         adaptive_run_spec: Specification for the adaptive evaluation
-         experiment_id: Experiment identifier
-         project_id: Trismik project ID
-         metadata: Additional metadata
-     Returns:
-         Results from the adaptive evaluation
-     """
-     runner = AdaptiveTest(
-         make_trismik_inference(inference),
-         client=TrismikAsyncClient(
-             service_url="https://api.trismik.com/adaptive-testing", api_key=get_token()
-         ),
-     )
-
-     logger.debug(
-         "test_id: %s, project_id: %s, experiment: %s ",
-         adaptive_run_spec.dataset,
-         project_id,
-         experiment_id,
-     )
-     trismik_results = runner.run(
-         adaptive_run_spec.dataset,
-         project_id,
-         experiment_id,
-         run_metadata=TrismikRunMetadata(
-             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
-             test_configuration={},
-             inference_setup={},
-         ),
-         return_dict=False,
-     )
-
-     # Convert TrismikRunResults to AdaptiveEvalRunResult
-     # Extract scores from the Trismik results
-     scores = {}
-     if hasattr(trismik_results, "scores") and trismik_results.scores:
-         scores = trismik_results.scores
-     elif hasattr(trismik_results, "__dict__"):
-         # If scores aren't directly available, include all attributes as scores
-         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
-
-     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries
-     def make_json_serializable(obj: Any) -> Any:
-         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
-             # This is likely an AdaptiveTestScore object
-             return {"theta": obj.theta, "std_error": obj.std_error}
-         elif isinstance(obj, dict):
-             return {k: make_json_serializable(v) for k, v in obj.items()}
-         elif isinstance(obj, (list, tuple)):
-             return [make_json_serializable(item) for item in obj]
-         else:
-             return obj
-
-     # Make scores JSON serializable
-     scores = make_json_serializable(scores)
-
-     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
-
-
- def make_trismik_inference(
-     inference_function: Callable,
-     return_list: bool = False,
- ) -> Callable[[Any], Any]:
-     """Wrap an inference function for flexible input handling.
-
-     Takes a function expecting list[dict] and makes it accept single dict
-     or TrismikMultipleChoiceTextItem.
-     """
-
-     # Check if the inference function is async
-     is_async = inspect.iscoroutinefunction(inference_function) or (
-         hasattr(inference_function, "__call__")
-         and inspect.iscoroutinefunction(inference_function.__call__)
-     )
-
-     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
-         # Single TrismikMultipleChoiceTextItem dataclass
-         if isinstance(eval_items, TrismikMultipleChoiceTextItem):
-             eval_item_dict = dataclasses.asdict(eval_items)
-             results = inference_function([eval_item_dict], **kwargs)
-             if is_async:
-                 results = asyncio.run(results)
-             return results if return_list else results[0]
-
-         # Single item (a mapping)
-         if isinstance(eval_items, Mapping):
-             results = inference_function([eval_items], **kwargs)
-             if is_async:
-                 results = asyncio.run(results)
-             return results if return_list else results[0]
-
-         # Iterable of items (but not a string/bytes)
-         if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
-             # Convert any TrismikMultipleChoiceTextItem instances to dicts
-             converted_items = []
-             for item in eval_items:
-                 if isinstance(item, TrismikMultipleChoiceTextItem):
-                     converted_items.append(dataclasses.asdict(item))
-                 else:
-                     converted_items.append(item)
-             results = inference_function(converted_items, **kwargs)
-             if is_async:
-                 results = asyncio.run(results)
-             return results
-
-         raise TypeError(
-             "Expected a single item (Mapping[str, Any] or TrismikMultipleChoiceTextItem) "
-             "or an iterable of such items."
-         )
-
-     return sync_trismik_inference_function
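
The deleted make_trismik_inference above implements a small adapter pattern: it takes a batch-oriented inference function (one expecting list[dict]) and returns a wrapper that also accepts a single item, detecting coroutine functions with inspect.iscoroutinefunction and bridging them into the synchronous call site with asyncio.run. A self-contained sketch of the same pattern under simplified assumptions, without the Trismik-specific types (adapt_batch_inference and echo_batch are hypothetical names):

    import asyncio
    import inspect
    from typing import Any, Callable, List, Mapping

    def adapt_batch_inference(batch_fn: Callable) -> Callable:
        """Accept a single mapping or a list of mappings; call batch_fn on a list."""
        is_async = inspect.iscoroutinefunction(batch_fn)

        def wrapper(items: Any, **kwargs: Any) -> Any:
            single = isinstance(items, Mapping)
            batch = [items] if single else list(items)
            results = batch_fn(batch, **kwargs)
            if is_async:
                # batch_fn returned a coroutine; run it to completion here.
                results = asyncio.run(results)
            return results[0] if single else results

        return wrapper

    def echo_batch(batch: List[Mapping[str, Any]]) -> List[str]:
        return [str(item.get("question", "")) for item in batch]

    single_fn = adapt_batch_inference(echo_batch)
    print(single_fn({"question": "2+2?"}))  # -> 2+2?

One caveat this bridging approach shares with the original: asyncio.run raises RuntimeError if an event loop is already running in the calling thread, which is presumably part of why 0.0.10 ships separate _sync and _async evaluate implementations.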
scorebook/trismik_services/upload_classic_eval_run.py
@@ -1,102 +0,0 @@
- """Upload classic evaluation run results to Trismik platform."""
-
- import logging
- from typing import Any, Dict, List, Optional
-
- from trismik.adaptive_test import AdaptiveTest
- from trismik.client_async import TrismikAsyncClient
- from trismik.types import (
-     TrismikClassicEvalItem,
-     TrismikClassicEvalMetric,
-     TrismikClassicEvalRequest,
-     TrismikClassicEvalResponse,
- )
-
- from scorebook.trismik_services.login import get_token
- from scorebook.types import ClassicEvalRunResult
-
- logger = logging.getLogger(__name__)
-
-
- async def upload_classic_eval_run(
-     run: ClassicEvalRunResult,
-     experiment_id: str,
-     project_id: str,
-     model: str,
-     metadata: Optional[Dict[str, Any]],
- ) -> TrismikClassicEvalResponse:
-     """Upload a classic evaluation run result to Trismik platform.
-
-     Args:
-         run: The evaluation run result to upload
-         experiment_id: Trismik experiment identifier
-         project_id: Trismik project identifier
-         model: Model name used for evaluation
-         metadata: Optional metadata dictionary
-
-     Returns:
-         Response from Trismik API containing the upload result
-     """
-     runner = AdaptiveTest(
-         lambda x: None,
-         client=TrismikAsyncClient(
-             service_url="https://api.trismik.com/adaptive-testing", api_key=get_token()
-         ),
-     )
-
-     # Create eval items from run_spec items, outputs, and labels
-     items: List[TrismikClassicEvalItem] = []
-     for idx, (item, output) in enumerate(zip(run.run_spec.items, run.outputs)):
-         label = run.run_spec.labels[idx] if idx < len(run.run_spec.labels) else ""
-
-         # Calculate item-level metrics for this item
-         item_metrics: Dict[str, Any] = {}
-         for metric_name, metric_data in run.scores.items():
-             if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                 if idx < len(metric_data["item_scores"]):
-                     item_metrics[metric_name] = metric_data["item_scores"][idx]
-             else:
-                 # If scores is just a single value, use it for all items
-                 item_metrics[metric_name] = metric_data
-
-         eval_item = TrismikClassicEvalItem(
-             datasetItemId=str(idx),
-             modelInput=str(item),
-             modelOutput=str(output),
-             goldOutput=str(label),
-             metrics=item_metrics,
-         )
-         items.append(eval_item)
-
-     # Create eval metrics from run aggregate scores
-     metrics: List[TrismikClassicEvalMetric] = []
-     for metric_name, metric_data in run.scores.items():
-         if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-             # Handle structured metric data with aggregate scores
-             for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                 metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                 metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                 metrics.append(metric)
-         else:
-             # Handle simple metric data (single value)
-             metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-             metrics.append(metric)
-
-     classic_eval_request = TrismikClassicEvalRequest(
-         project_id,
-         experiment_id,
-         run.run_spec.dataset.name,
-         model,
-         run.run_spec.hyperparameter_config,
-         items,
-         metrics,
-     )
-
-     response: TrismikClassicEvalResponse = await runner.submit_classic_eval_async(
-         classic_eval_request
-     )
-
-     run_id: str = response.id
-     logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-     return response
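
The deleted uploader above expects run.scores to map each metric name either to a plain scalar or to a dict carrying "item_scores" (one value per dataset item) and "aggregate_scores" (named summary values). A self-contained sketch of that flattening step under the same assumed shape (split_scores is a hypothetical name):

    from typing import Any, Dict, List, Tuple

    def split_scores(
        scores: Dict[str, Any], num_items: int
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Derive per-item metric dicts and aggregate metrics from a scores mapping."""
        per_item: List[Dict[str, Any]] = [{} for _ in range(num_items)]
        aggregates: Dict[str, Any] = {}
        for name, data in scores.items():
            if isinstance(data, dict) and "item_scores" in data:
                for idx, value in enumerate(data["item_scores"][:num_items]):
                    per_item[idx][name] = value
                for agg_name, agg_value in data.get("aggregate_scores", {}).items():
                    # Mirror the deleted code's naming: "accuracy_mean" etc.,
                    # but plain "accuracy" when the aggregate repeats the metric name.
                    key = name if agg_name == name else f"{name}_{agg_name}"
                    aggregates[key] = agg_value
            else:
                # A bare scalar applies to every item and is its own aggregate.
                for item_metrics in per_item:
                    item_metrics[name] = data
                aggregates[name] = data
        return per_item, aggregates

    per_item, aggregates = split_scores(
        {"accuracy": {"item_scores": [1, 0], "aggregate_scores": {"accuracy": 0.5}}},
        num_items=2,
    )
    print(per_item)    # [{'accuracy': 1}, {'accuracy': 0}]
    print(aggregates)  # {'accuracy': 0.5}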
scorebook-0.0.9.dist-info/RECORD
@@ -1,36 +0,0 @@
- scorebook/__init__.py,sha256=30kyXG8sVbThtCt6cjPtkx7hiaUEukRQC-RsgunlkL4,557
- scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
- scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
- scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
- scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
- scorebook/evaluate.py,sha256=OOBTZmx84ZAuZKbIe1pp9L3201cX2gjPAkw_llYNnfE,21899
- scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
- scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
- scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
- scorebook/inference/openai.py,sha256=iJVWp0HT9skyM4KXf21yaEjLafdETT5kK3HKl7MZ1hg,8292
- scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
- scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
- scorebook/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
- scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
- scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
- scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
- scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
- scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
- scorebook/trismik_services/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
- scorebook/trismik_services/adaptive_testing_service.py,sha256=4FVW8g7EvJmHYpQp68y0U3xzOw_qJ9nkhEPiMnzTb4s,5103
- scorebook/trismik_services/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
- scorebook/trismik_services/upload_classic_eval_run.py,sha256=t6tvaKfP2lFBKmZ_bM6oe_mUOjA_UKu35GbgL26QW3A,3658
- scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
- scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
- scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
- scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
- scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
- scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
- scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
- scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
- scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
- scorebook-0.0.9.dist-info/METADATA,sha256=-VyogmnwuCfPBaj6BS_gOGKxgAkxcnAx6k2GFaI1jGg,11516
- scorebook-0.0.9.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- scorebook-0.0.9.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
- scorebook-0.0.9.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
- scorebook-0.0.9.dist-info/RECORD,,
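
The removed RECORD above follows the standard wheel manifest format: one CSV row per installed file as path,sha256=<digest>,<size in bytes>, with the final RECORD,, row left without hash and size because the manifest cannot record its own digest. A small parsing sketch (parse_record_row is a hypothetical name):

    import csv
    from io import StringIO
    from typing import Optional, Tuple

    def parse_record_row(row: str) -> Tuple[str, Optional[str], Optional[int]]:
        """Split one wheel RECORD row into (path, hash, size); rows are CSV."""
        path, digest, size = next(csv.reader(StringIO(row)))
        return path, digest or None, int(size) if size else None

    print(parse_record_row("scorebook/settings.py,sha256=abc123,512"))
    # -> ('scorebook/settings.py', 'sha256=abc123', 512)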