scorebook 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -4
- scorebook/cli/auth.py +1 -1
- scorebook/evaluate/__init__.py +15 -0
- scorebook/evaluate/_async/__init__.py +0 -0
- scorebook/evaluate/_async/evaluate_async.py +413 -0
- scorebook/evaluate/_sync/__init__.py +0 -0
- scorebook/evaluate/_sync/evaluate.py +413 -0
- scorebook/evaluate/evaluate_helpers.py +365 -0
- scorebook/inference/__init__.py +4 -0
- scorebook/inference/clients/__init__.py +8 -0
- scorebook/inference/{openai.py → clients/openai.py} +35 -23
- scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
- scorebook/settings.py +18 -0
- scorebook/trismik/__init__.py +10 -0
- scorebook/utils/__init__.py +9 -2
- scorebook/utils/async_utils.py +20 -1
- scorebook/utils/progress_bars.py +22 -61
- {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
- scorebook-0.0.10.dist-info/RECORD +41 -0
- scorebook/evaluate.py +0 -623
- scorebook/trismik_services/__init__.py +0 -6
- scorebook/trismik_services/adaptive_testing_service.py +0 -141
- scorebook/trismik_services/upload_classic_eval_run.py +0 -102
- scorebook-0.0.9.dist-info/RECORD +0 -36
- /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
- /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
- /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
- /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
scorebook/trismik_services/adaptive_testing_service.py
DELETED
@@ -1,141 +0,0 @@
-"""Trismik adaptive testing service integration."""
-
-import asyncio
-import dataclasses
-import inspect
-import logging
-from typing import Any, Callable, Iterable, Mapping
-
-from trismik.adaptive_test import AdaptiveTest
-from trismik.client_async import TrismikAsyncClient
-from trismik.types import TrismikMultipleChoiceTextItem, TrismikRunMetadata
-
-from scorebook.types import AdaptiveEvalRunResult, AdaptiveEvalRunSpec
-
-from .login import get_token
-
-logger = logging.getLogger(__name__)
-
-
-async def run_adaptive_evaluation(
-    inference: Callable,
-    adaptive_run_spec: AdaptiveEvalRunSpec,
-    experiment_id: str,
-    project_id: str,
-    metadata: Any,
-) -> AdaptiveEvalRunResult:
-    """Run an adaptive evaluation using the Trismik API.
-
-    Args:
-        inference: Function to run inference
-        adaptive_run_spec: Specification for the adaptive evaluation
-        experiment_id: Experiment identifier
-        project_id: Trismik project ID
-        metadata: Additional metadata
-    Returns:
-        Results from the adaptive evaluation
-    """
-    runner = AdaptiveTest(
-        make_trismik_inference(inference),
-        client=TrismikAsyncClient(
-            service_url="https://api.trismik.com/adaptive-testing", api_key=get_token()
-        ),
-    )
-
-    logger.debug(
-        "test_id: %s, project_id: %s, experiment: %s ",
-        adaptive_run_spec.dataset,
-        project_id,
-        experiment_id,
-    )
-    trismik_results = runner.run(
-        adaptive_run_spec.dataset,
-        project_id,
-        experiment_id,
-        run_metadata=TrismikRunMetadata(
-            model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
-            test_configuration={},
-            inference_setup={},
-        ),
-        return_dict=False,
-    )
-
-    # Convert TrismikRunResults to AdaptiveEvalRunResult
-    # Extract scores from the Trismik results
-    scores = {}
-    if hasattr(trismik_results, "scores") and trismik_results.scores:
-        scores = trismik_results.scores
-    elif hasattr(trismik_results, "__dict__"):
-        # If scores aren't directly available, include all attributes as scores
-        scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
-
-    # Convert AdaptiveTestScore objects to JSON-serializable dictionaries
-    def make_json_serializable(obj: Any) -> Any:
-        if hasattr(obj, "theta") and hasattr(obj, "std_error"):
-            # This is likely an AdaptiveTestScore object
-            return {"theta": obj.theta, "std_error": obj.std_error}
-        elif isinstance(obj, dict):
-            return {k: make_json_serializable(v) for k, v in obj.items()}
-        elif isinstance(obj, (list, tuple)):
-            return [make_json_serializable(item) for item in obj]
-        else:
-            return obj
-
-    # Make scores JSON serializable
-    scores = make_json_serializable(scores)
-
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
-
-
-def make_trismik_inference(
-    inference_function: Callable,
-    return_list: bool = False,
-) -> Callable[[Any], Any]:
-    """Wrap an inference function for flexible input handling.
-
-    Takes a function expecting list[dict] and makes it accept single dict
-    or TrismikMultipleChoiceTextItem.
-    """
-
-    # Check if the inference function is async
-    is_async = inspect.iscoroutinefunction(inference_function) or (
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
-
-    def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
-        # Single TrismikMultipleChoiceTextItem dataclass
-        if isinstance(eval_items, TrismikMultipleChoiceTextItem):
-            eval_item_dict = dataclasses.asdict(eval_items)
-            results = inference_function([eval_item_dict], **kwargs)
-            if is_async:
-                results = asyncio.run(results)
-            return results if return_list else results[0]
-
-        # Single item (a mapping)
-        if isinstance(eval_items, Mapping):
-            results = inference_function([eval_items], **kwargs)
-            if is_async:
-                results = asyncio.run(results)
-            return results if return_list else results[0]
-
-        # Iterable of items (but not a string/bytes)
-        if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
-            # Convert any TrismikMultipleChoiceTextItem instances to dicts
-            converted_items = []
-            for item in eval_items:
-                if isinstance(item, TrismikMultipleChoiceTextItem):
-                    converted_items.append(dataclasses.asdict(item))
-                else:
-                    converted_items.append(item)
-            results = inference_function(converted_items, **kwargs)
-            if is_async:
-                results = asyncio.run(results)
-            return results
-
-        raise TypeError(
-            "Expected a single item (Mapping[str, Any] or TrismikMultipleChoiceTextItem) "
-            "or an iterable of such items."
-        )
-
-    return sync_trismik_inference_function
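Note (illustrative, not part of the package diff): the deleted make_trismik_inference wrapper above normalizes whatever the Trismik runner passes in (a single TrismikMultipleChoiceTextItem, a single mapping, or an iterable of items) into the list-of-dicts shape a scorebook inference function expects. A minimal, self-contained sketch of that dispatch pattern follows; wrap_batch_inference and ExampleItem are hypothetical names, not scorebook or trismik APIs.

```python
# Illustrative sketch only -- wrap_batch_inference and ExampleItem are hypothetical names.
# It mirrors the dispatch in the deleted make_trismik_inference above.
import dataclasses
from collections.abc import Iterable, Mapping
from typing import Any, Callable


@dataclasses.dataclass
class ExampleItem:
    """Stand-in for TrismikMultipleChoiceTextItem."""
    question: str
    choices: list


def wrap_batch_inference(batch_fn: Callable[[list], list]) -> Callable[[Any], Any]:
    """Make a list-of-dicts inference function accept a single item or an iterable."""

    def wrapper(eval_items: Any) -> Any:
        # Single dataclass item: convert to a dict and run a one-element batch.
        if dataclasses.is_dataclass(eval_items) and not isinstance(eval_items, type):
            return batch_fn([dataclasses.asdict(eval_items)])[0]
        # Single mapping item.
        if isinstance(eval_items, Mapping):
            return batch_fn([dict(eval_items)])[0]
        # Iterable of items (but not a string/bytes): convert dataclasses, keep the rest.
        if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
            converted = [
                dataclasses.asdict(i)
                if dataclasses.is_dataclass(i) and not isinstance(i, type)
                else i
                for i in eval_items
            ]
            return batch_fn(converted)
        raise TypeError("Expected a mapping, a dataclass item, or an iterable of such items.")

    return wrapper


# Usage: a toy batch function that returns the first choice of every item.
batch_fn = lambda items: [item["choices"][0] for item in items]
print(wrap_batch_inference(batch_fn)(ExampleItem("2+2?", ["4", "5"])))  # prints: 4
```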
scorebook/trismik_services/upload_classic_eval_run.py
DELETED
@@ -1,102 +0,0 @@
-"""Upload classic evaluation run results to Trismik platform."""
-
-import logging
-from typing import Any, Dict, List, Optional
-
-from trismik.adaptive_test import AdaptiveTest
-from trismik.client_async import TrismikAsyncClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-)
-
-from scorebook.trismik_services.login import get_token
-from scorebook.types import ClassicEvalRunResult
-
-logger = logging.getLogger(__name__)
-
-
-async def upload_classic_eval_run(
-    run: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    model: str,
-    metadata: Optional[Dict[str, Any]],
-) -> TrismikClassicEvalResponse:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-
-    Returns:
-        Response from Trismik API containing the upload result
-    """
-    runner = AdaptiveTest(
-        lambda x: None,
-        client=TrismikAsyncClient(
-            service_url="https://api.trismik.com/adaptive-testing", api_key=get_token()
-        ),
-    )
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run.run_spec.items, run.outputs)):
-        label = run.run_spec.labels[idx] if idx < len(run.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        for metric_name, metric_data in run.scores.items():
-            if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                if idx < len(metric_data["item_scores"]):
-                    item_metrics[metric_name] = metric_data["item_scores"][idx]
-            else:
-                # If scores is just a single value, use it for all items
-                item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    for metric_name, metric_data in run.scores.items():
-        if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-            # Handle structured metric data with aggregate scores
-            for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                metrics.append(metric)
-        else:
-            # Handle simple metric data (single value)
-            metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-            metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run.run_spec.dataset.name,
-        model,
-        run.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await runner.submit_classic_eval_async(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return response
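Note (illustrative, not part of the package diff): in the deleted uploader above, each entry in run.scores is either a bare value or a dict carrying "item_scores" and "aggregate_scores", and the code fans it out into per-item metrics and aggregate TrismikClassicEvalMetric records. A small sketch of that fan-out under the same assumed scores layout follows; split_scores is a hypothetical helper, not a scorebook API.

```python
# Illustrative sketch only -- split_scores is a hypothetical helper, not a scorebook API.
# Assumed layout: {"metric": {"item_scores": [...], "aggregate_scores": {...}}} or {"metric": <value>}.
from typing import Any, Dict, List, Tuple


def split_scores(scores: Dict[str, Any], n_items: int) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Split a scores mapping into per-item metric dicts and flat aggregate metrics."""
    per_item: List[Dict[str, Any]] = [{} for _ in range(n_items)]
    aggregates: Dict[str, Any] = {}
    for name, data in scores.items():
        if isinstance(data, dict) and "item_scores" in data:
            # Structured metric: one score per item plus named aggregates.
            for idx, value in enumerate(data["item_scores"][:n_items]):
                per_item[idx][name] = value
            for agg_name, agg_value in data.get("aggregate_scores", {}).items():
                key = name if agg_name == name else f"{name}_{agg_name}"
                aggregates[key] = agg_value
        else:
            # Bare value: applied to every item and reported as the aggregate.
            for item_metrics in per_item:
                item_metrics[name] = data
            aggregates[name] = data
    return per_item, aggregates


per_item, aggregates = split_scores(
    {"accuracy": {"item_scores": [1, 0], "aggregate_scores": {"accuracy": 0.5}}}, n_items=2
)
print(per_item)    # [{'accuracy': 1}, {'accuracy': 0}]
print(aggregates)  # {'accuracy': 0.5}
```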
scorebook-0.0.9.dist-info/RECORD
DELETED
@@ -1,36 +0,0 @@
-scorebook/__init__.py,sha256=30kyXG8sVbThtCt6cjPtkx7hiaUEukRQC-RsgunlkL4,557
-scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
-scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
-scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
-scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
-scorebook/evaluate.py,sha256=OOBTZmx84ZAuZKbIe1pp9L3201cX2gjPAkw_llYNnfE,21899
-scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
-scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
-scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
-scorebook/inference/openai.py,sha256=iJVWp0HT9skyM4KXf21yaEjLafdETT5kK3HKl7MZ1hg,8292
-scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
-scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
-scorebook/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
-scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
-scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
-scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
-scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
-scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
-scorebook/trismik_services/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
-scorebook/trismik_services/adaptive_testing_service.py,sha256=4FVW8g7EvJmHYpQp68y0U3xzOw_qJ9nkhEPiMnzTb4s,5103
-scorebook/trismik_services/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
-scorebook/trismik_services/upload_classic_eval_run.py,sha256=t6tvaKfP2lFBKmZ_bM6oe_mUOjA_UKu35GbgL26QW3A,3658
-scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
-scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
-scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
-scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
-scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
-scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
-scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
-scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.9.dist-info/METADATA,sha256=-VyogmnwuCfPBaj6BS_gOGKxgAkxcnAx6k2GFaI1jGg,11516
-scorebook-0.0.9.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-scorebook-0.0.9.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
-scorebook-0.0.9.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
-scorebook-0.0.9.dist-info/RECORD,,
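Note (general wheel-format background, not from this diff): each RECORD line has the form path,sha256=<digest>,<size>, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with padding stripped (PEP 376 and the wheel spec). A minimal sketch of computing such an entry follows; record_entry is a hypothetical helper, not part of scorebook.

```python
# Illustrative sketch only -- record_entry is a hypothetical helper.
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Build a RECORD-style line: <path>,sha256=<urlsafe b64 digest, no padding>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"


# Example (run from an unpacked wheel): print(record_entry("scorebook/__init__.py"))
```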
The seven remaining entries listed above with +0 -0 (bedrock.py, portkey.py, vertex.py, login.py → credentials.py, and the WHEEL, entry_points.txt, and LICENSE dist-info files) were renamed or moved without content changes.