scorebook 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scorebook-0.0.3 → scorebook-0.0.5}/PKG-INFO +2 -1
- {scorebook-0.0.3 → scorebook-0.0.5}/pyproject.toml +9 -1
- scorebook-0.0.5/src/scorebook/__init__.py +18 -0
- scorebook-0.0.5/src/scorebook/cli/__init__.py +1 -0
- scorebook-0.0.5/src/scorebook/cli/auth.py +98 -0
- scorebook-0.0.5/src/scorebook/cli/main.py +57 -0
- {scorebook-0.0.3/src/scorebook/types → scorebook-0.0.5/src/scorebook}/eval_dataset.py +38 -0
- scorebook-0.0.5/src/scorebook/evaluate.py +531 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/inference/openai.py +23 -1
- scorebook-0.0.5/src/scorebook/trismik/__init__.py +6 -0
- scorebook-0.0.5/src/scorebook/trismik/adaptive_testing_service.py +141 -0
- scorebook-0.0.5/src/scorebook/trismik/login.py +120 -0
- scorebook-0.0.5/src/scorebook/types.py +165 -0
- scorebook-0.0.3/src/scorebook/__init__.py +0 -16
- scorebook-0.0.3/src/scorebook/evaluator.py +0 -379
- scorebook-0.0.3/src/scorebook/types/__init__.py +0 -12
- scorebook-0.0.3/src/scorebook/types/eval_result.py +0 -133
- scorebook-0.0.3/src/scorebook/types/eval_run_spec.py +0 -28
- scorebook-0.0.3/src/scorebook/utils/logging_utils.py +0 -1
- {scorebook-0.0.3 → scorebook-0.0.5}/LICENSE +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/README.md +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/exceptions.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/inference/__init__.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/inference/bedrock.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/inference/portkey.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/inference/vertex.py +0 -0
- {scorebook-0.0.3/src/scorebook/types → scorebook-0.0.5/src/scorebook}/inference_pipeline.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/metrics/__init__.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/metrics/accuracy.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/metrics/metric_base.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/metrics/metric_registry.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/metrics/precision.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/__init__.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/async_utils.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/build_prompt.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/io_helpers.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/jinja_helpers.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/mappers.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/progress_bars.py +0 -0
- {scorebook-0.0.3 → scorebook-0.0.5}/src/scorebook/utils/transform_helpers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: scorebook
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.5
|
|
4
4
|
Summary: A Python project for LLM evaluation.
|
|
5
5
|
Author: Euan Campbell
|
|
6
6
|
Author-email: euan@trismik.com
|
|
@@ -35,6 +35,7 @@ Requires-Dist: torch ; extra == "examples"
|
|
|
35
35
|
Requires-Dist: torchaudio ; extra == "examples"
|
|
36
36
|
Requires-Dist: torchvision ; extra == "examples"
|
|
37
37
|
Requires-Dist: transformers ; extra == "examples"
|
|
38
|
+
Requires-Dist: trismik
|
|
38
39
|
Description-Content-Type: text/markdown
|
|
39
40
|
|
|
40
41
|
# Scorebook
|
|
@@ -11,12 +11,20 @@ requires-python = ">=3.9"
|
|
|
11
11
|
dependencies = [
|
|
12
12
|
"datasets>=3.6.0",
|
|
13
13
|
"notebook (>=7.4.5,<8.0.0)",
|
|
14
|
+
"trismik",
|
|
14
15
|
]
|
|
15
16
|
|
|
17
|
+
[project.scripts]
|
|
18
|
+
scorebook = "scorebook.cli.main:main"
|
|
19
|
+
|
|
16
20
|
[tool.poetry]
|
|
17
|
-
version = "0.0.
|
|
21
|
+
version = "0.0.5" # base version
|
|
18
22
|
packages = [{ include = "scorebook", from = "src" }]
|
|
19
23
|
|
|
24
|
+
[[tool.poetry.source]]
|
|
25
|
+
name = "testpypi"
|
|
26
|
+
url = "https://test.pypi.org/simple/"
|
|
27
|
+
priority = "supplemental"
|
|
20
28
|
|
|
21
29
|
[tool.poetry.group.dev.dependencies]
|
|
22
30
|
pytest = "^8.3.2"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scorebook package.
|
|
3
|
+
|
|
4
|
+
A Python project for scorebook functionality.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import importlib.metadata
|
|
8
|
+
|
|
9
|
+
# get version from pyproject.toml
|
|
10
|
+
__version__ = importlib.metadata.version(__package__ or __name__)
|
|
11
|
+
|
|
12
|
+
from scorebook.eval_dataset import EvalDataset
|
|
13
|
+
from scorebook.evaluate import evaluate
|
|
14
|
+
from scorebook.inference_pipeline import InferencePipeline
|
|
15
|
+
from scorebook.trismik.login import login, whoami
|
|
16
|
+
from scorebook.utils.build_prompt import build_prompt
|
|
17
|
+
|
|
18
|
+
__all__ = ["EvalDataset", "evaluate", "build_prompt", "login", "whoami", "InferencePipeline"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI module for scorebook."""
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Authentication CLI commands."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import getpass
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from scorebook.trismik.login import get_stored_token, get_token_path, login, logout, whoami
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def auth_command(args: argparse.Namespace) -> int:
    """Route an ``auth`` subcommand to its handler.

    Returns the handler's exit code, or 1 when no subcommand was given.
    """
    subcommand = args.auth_command
    if subcommand == "login":
        return login_command(args)
    if subcommand == "logout":
        return logout_command(args)
    if subcommand == "whoami":
        return whoami_command(args)
    # Unknown or missing subcommand: report on stderr and fail.
    print(
        "Error: No auth command specified. Use 'login', 'logout', or 'whoami'.", file=sys.stderr
    )
    return 1
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def login_command(args: argparse.Namespace) -> int:
    """Handle the ``auth login`` flow.

    Uses ``--token`` when supplied; otherwise prompts interactively,
    asking first before overwriting an already-stored token.

    Returns:
        0 on success (or cancelled overwrite), 1 on failure, 130 on Ctrl-C.
    """
    try:
        api_token = args.token

        if not api_token:
            # Already logged in? Confirm before clobbering the saved token.
            if get_stored_token():
                print("You are already logged in.")
                answer = (
                    input("Do you want to overwrite the existing token? (y/N): ").lower().strip()
                )
                if answer not in ("y", "yes"):
                    print("Login cancelled.")
                    return 0

            # Read the token without echoing it to the terminal.
            print("Enter your Trismik API token:")
            print("(You can find your token at: https://trismik.com/settings/tokens)")
            api_token = getpass.getpass("Token: ").strip()

            if not api_token:
                print("Error: No token provided.", file=sys.stderr)
                return 1

        # Persist the token.
        login(api_token)

        print(f"Successfully logged in! Token saved to {get_token_path()}")
        return 0

    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except KeyboardInterrupt:
        print("\nLogin cancelled.")
        return 130
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def logout_command(args: argparse.Namespace) -> int:
    """Handle the ``auth logout`` command.

    Returns 0 whether or not a token was stored; 1 only on unexpected errors.
    """
    try:
        if logout():
            print("Successfully logged out!")
        else:
            print("Not currently logged in.")
        return 0
    except Exception as e:
        print(f"Error during logout: {e}", file=sys.stderr)
        return 1
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def whoami_command(args: argparse.Namespace) -> int:
    """Handle the ``auth whoami`` command.

    Returns 0 when a token is stored, 1 when not logged in or on error.
    """
    try:
        token = whoami()
        if token is None:
            print("Not logged in. Run 'scorebook auth login' first.")
            return 1
        # TODO: Make actual API call to get user info
        # For now, just confirm we have a token
        print(f"Logged in with token: {token[:8]}...")
        return 0
    except Exception as e:
        print(f"Error checking login status: {e}", file=sys.stderr)
        return 1
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Main CLI entry point for scorebook."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from .auth import auth_command
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def create_parser() -> argparse.ArgumentParser:
    """Build and return the top-level ``scorebook`` argument parser."""
    parser = argparse.ArgumentParser(
        prog="scorebook",
        description="Scorebook CLI - A Python project for LLM evaluation",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # "auth" command with its own login/logout/whoami subcommands.
    auth = subparsers.add_parser("auth", help="Authentication commands")
    auth_sub = auth.add_subparsers(dest="auth_command", help="Auth commands")

    auth_sub.add_parser("login", help="Login to scorebook").add_argument(
        "--token", help="API token to use for login"
    )
    auth_sub.add_parser("logout", help="Logout from scorebook")
    auth_sub.add_parser("whoami", help="Show current login status")

    return parser
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Parse *argv* and dispatch to the selected command.

    Returns:
        A process exit code: 0 on success, 1 on error or no command,
        130 when interrupted with Ctrl-C.
    """
    cli_parser = create_parser()
    parsed = cli_parser.parse_args(argv)

    try:
        if parsed.command == "auth":
            return auth_command(parsed)
        # No recognised command: show usage and signal failure.
        cli_parser.print_help()
        return 1
    except KeyboardInterrupt:
        print("\nOperation cancelled.")
        return 130
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Allow running this module directly; exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import csv
|
|
4
4
|
import json
|
|
5
|
+
import random
|
|
5
6
|
from typing import Any, Dict, Iterator, List, Optional, Type, Union
|
|
6
7
|
|
|
7
8
|
import yaml
|
|
@@ -364,3 +365,40 @@ class EvalDataset:
|
|
|
364
365
|
resolved.append(MetricRegistry.get(m)) # Use registry for str or class
|
|
365
366
|
|
|
366
367
|
return resolved
|
|
368
|
+
|
|
369
|
+
def sample(self, sample_size: int) -> "EvalDataset":
    """Return a new dataset containing a random subset of this one's items.

    Args:
        sample_size: The number of items to sample from the dataset

    Returns:
        A new EvalDataset with randomly sampled items

    Raises:
        ValueError: If sample_size is larger than the dataset size
    """
    total = len(self.items)
    if sample_size > total:
        raise ValueError(
            f"Sample size {sample_size} is larger than dataset size {total} "
            f"for dataset '{self.name}'"
        )

    # Draw items without replacement.
    chosen = random.sample(self.items, sample_size)

    # Rebuild through from_list so the copy goes through the normal
    # construction path.
    subset = self.from_list(
        name=self.name,
        label=self.label,
        metrics=self.metrics,
        data=chosen,
    )

    # Carry the prompt template over to the sampled copy, if one is set.
    if self.prompt_template is not None:
        subset.prompt_template = self.prompt_template

    return subset
|