scorebook 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/utils/setup.py
@@ -0,0 +1,98 @@
+ """
+ Utility functions for setting up Scorebook examples.
+
+ This module provides common helper functions used across multiple Scorebook examples
+ for output directory setup and logging configuration.
+ """
+
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+
+
+ def setup_output_directory() -> Path:
+     """Parse command line arguments and setup output directory."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Run evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "examples/example_results"),
+         help=(
+             "Directory to save evaluation outputs (CSV and JSON). "
+             "Defaults to ./examples/example_results in the current working directory."
+         ),
+     )
+     args = parser.parse_args()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     print(f"Saving results to {output_dir}")
+     return output_dir
+
+
+ def setup_logging(
+     log_dir: str = "logs",
+     experiment_id: Optional[str] = None,
+     base_dir: Optional[Path] = None,
+ ) -> Path:
+     """Configure logging for evaluation runs.
+
+     Args:
+         log_dir: Name of the log directory (default: "logs")
+         experiment_id: Optional identifier for the experiment
+         base_dir: Base directory where log_dir should be created.
+             If None, uses current working directory.
+     """
+     if base_dir is None:
+         base_dir = Path.cwd()
+
+     log_dir_path: Path = base_dir / log_dir
+     log_dir_path.mkdir(exist_ok=True, parents=True)
+
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     if experiment_id:
+         log_file = log_dir_path / f"evaluation_{experiment_id}_{timestamp}.log"
+     else:
+         log_file = log_dir_path / f"evaluation_{timestamp}.log"
+
+     # Create file handler for all logs (same as before)
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+     file_handler.setFormatter(
+         logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+     )
+
+     # Create console handler for warnings and errors only
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.WARNING)
+     console_handler.setFormatter(logging.Formatter("%(levelname)s - %(name)s - %(message)s"))
+
+     # Configure root logger with both handlers
+     logging.basicConfig(
+         level=logging.INFO,
+         handlers=[file_handler, console_handler],
+         force=True,
+     )
+
+     # Set scorebook loggers to DEBUG level to capture all scorebook logs
+     scorebook_logger = logging.getLogger("scorebook")
+     scorebook_logger.setLevel(logging.DEBUG)
+
+     # Ensure trismik_services logs are captured at DEBUG level
+     trismik_services_logger = logging.getLogger("scorebook.trismik_services")
+     trismik_services_logger.setLevel(logging.DEBUG)
+
+     # Ensure evaluate logs are captured at DEBUG level
+     evaluate_logger = logging.getLogger("scorebook.evaluate._sync.evaluate")
+     evaluate_logger.setLevel(logging.DEBUG)
+     evaluate_logger = logging.getLogger("scorebook.evaluate._async.evaluate_async")
+     evaluate_logger.setLevel(logging.DEBUG)
+
+     # Exclude OpenAI inference logs to reduce noise
+     openai_logger = logging.getLogger("scorebook.inference.openai")
+     openai_logger.setLevel(logging.WARNING)  # Only log warnings and errors
+
+     print(f"Logging to {log_file}")
+     return log_file
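For orientation, here is a minimal usage sketch (not part of the package) showing how a tutorial script might call these new helpers; the import path tutorials.utils.setup is assumed from the file layout above:

# Hypothetical usage sketch; assumes the module is importable as tutorials.utils.setup.
from tutorials.utils.setup import setup_logging, setup_output_directory

output_dir = setup_output_directory()           # parses --output-dir and creates the directory
log_file = setup_logging(experiment_id="demo")  # logs to ./logs/evaluation_demo_<timestamp>.log
print(output_dir, log_file)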
scorebook/metrics/metric_registry.py
@@ -1,107 +0,0 @@
- """
- Registry module for evaluation metrics.
-
- This module maintains a centralized registry of available evaluation metrics
- that can be used to assess model performance. It provides a single access point
- to retrieve all implemented metric classes.
- """
-
- from typing import Any, Callable, Dict, List, Type, Union
-
- from scorebook.metrics.metric_base import MetricBase
-
-
- class MetricRegistry:
-     """A registry for evaluation metrics.
-
-     This class provides a central registry for all evaluation metrics in the system.
-     It allows metrics to be registered with unique names and retrieved either by
-     name or by class. The registry ensures that metrics are properly initialized
-     and accessible throughout the application.
-
-     The registry supports:
-     - Registering new metric classes with optional custom names
-     - Retrieving metric instances by name or class
-     - Listing all available metrics
-
-     Usage:
-         @MetricRegistry.register("custom_name")
-         class MyMetric(MetricBase):
-             ...
-
-         # Get by name
-         metric = MetricRegistry.get("custom_name")
-
-         # Get by class
-         metric = MetricRegistry.get(MyMetric)
-
-         # List available metrics
-         metrics = MetricRegistry.list_metrics()
-     """
-
-     _registry: Dict[str, Type[MetricBase]] = {}
-
-     @classmethod
-     def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
-         """
-         Register a metric class in the registry.
-
-         Returns:
-             A decorator that registers the class and returns it.
-
-         Raises:
-             ValueError: If a metric with the given name is already registered.
-         """
-
-         def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
-
-             key = metric_cls.__name__.lower()
-             if key in cls._registry:
-                 raise ValueError(f"Metric '{key}' is already registered")
-             cls._registry[key] = metric_cls
-             return metric_cls
-
-         return decorator
-
-     @classmethod
-     def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
-         """
-         Get an instance of a registered metric by name or class.
-
-         Args:
-             name_or_class: The metric name (string) or class (subclass of BaseMetric).
-             **kwargs: Additional arguments to pass to the metric's constructor.
-
-         Returns:
-             An instance of the requested metric.
-
-         Raises:
-             ValueError: If the metric name is not registered.
-         """
-         # If input is a class that's a subclass of BaseMetric, instantiate it directly
-         if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
-             return name_or_class(**kwargs)
-
-         # If input is a string, look up the class in the registry
-         if isinstance(name_or_class, str):
-             key = name_or_class.lower()
-
-             if key not in cls._registry:
-                 raise ValueError(f"Metric '{name_or_class}' not registered.")
-
-             return cls._registry[key](**kwargs)
-
-         raise ValueError(
-             f"Invalid metric type: {type(name_or_class)}."
-             f"Must be string name or BaseMetric subclass"
-         )
-
-     @classmethod
-     def list_metrics(cls) -> List[str]:
-         """
-         List all registered metrics.
-
-         Returns:
-             A list of metric names.
-         """
-         return list(cls._registry.keys())
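Note that the removed class's docstring example did not match its implementation: register() accepted no name argument and keyed each entry on the lowercased class name, so registering under "custom_name" was never actually supported. A short sketch consistent with the removed code follows; MyMetric is hypothetical, and the replacement registry now lives in scorebook/metrics/core/metric_registry.py (per the file list above), where the API may differ:

# Sketch against the removed API; MyMetric is hypothetical and assumed to
# implement whatever abstract interface MetricBase requires.
from scorebook.metrics.metric_base import MetricBase
from scorebook.metrics.metric_registry import MetricRegistry

@MetricRegistry.register()  # no name argument; the registry key becomes "mymetric"
class MyMetric(MetricBase):
    ...

metric = MetricRegistry.get("MyMetric")  # string lookup is lowercased, so this resolves
metric = MetricRegistry.get(MyMetric)    # a MetricBase subclass is instantiated directly
print(MetricRegistry.list_metrics())     # ["mymetric"]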
scorebook-0.0.14.dist-info/RECORD
@@ -1,53 +0,0 @@
- scorebook/__init__.py,sha256=S2JaZZsx76p0EjXtKz4UPdSzuO60jAjOvooYP-idBu8,1144
- scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
- scorebook/cli/auth.py,sha256=VGS5T0CSeS0n_7bntNggrYx-vDwxJJHdYxbKedFAq74,2939
- scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
- scorebook/dashboard/__init__.py,sha256=36DxO3oXVcZ2I6kizLFCcJkLBpXOU8UIXFT_ZjeFTB4,50
- scorebook/dashboard/create_project.py,sha256=RK90aMN0_XVM-DnawTY_b59yPJaRnpb_GoidCqXB5Vw,2845
- scorebook/dashboard/credentials.py,sha256=Q_khY5AX3fnyWshHe6LaesBHcCmNBse6a_XFGT8OOaw,3474
- scorebook/dashboard/upload_results.py,sha256=sdgOEf0C7QLt7t2QiXvSoceQpAiiPmlG_4SFEEzVPlc,9738
- scorebook/eval_datasets/__init__.py,sha256=wsmFNyuZJdBxjokcKG4NRfuUzPZKuzsKX3aG21zfFV4,39
- scorebook/eval_datasets/eval_dataset.py,sha256=xnG7VaceWUmg8Wrk2IGnVFZs9umzmZrW8F7THvtWMqs,28041
- scorebook/evaluate/__init__.py,sha256=Qqe-l4y3Nu81Fdx83RbtCQESoXC0XukBgOC3DPSWZZA,39
- scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/evaluate/_async/evaluate_async.py,sha256=G0RB_A1f5mQ42D82DnxkzAZhyV5kgbxi9Lr7qKaKUyY,16590
- scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/evaluate/_sync/evaluate.py,sha256=OIUsW2U1IrdwYIIPsfpTCOfJDAYJ6BYl-6pQQiafSNE,16364
- scorebook/evaluate/evaluate_helpers.py,sha256=NnanxLEeHwoZNztGXQJc6u_WqKfDkn1vYmck2BrKF-c,17028
- scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
- scorebook/inference/__init__.py,sha256=gGuZG1rdpxKYC54q0eAS6oTHQbRYhgxlBeAqonqHvRU,60
- scorebook/inference/clients/__init__.py,sha256=VaLW7mi4tywJtR3Q9wr2pPci8NlEQ3bJanZyM5S81Z4,51
- scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
- scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
- scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
- scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
- scorebook/inference/inference_pipeline.py,sha256=1qSmfI4fBJFS3EcAhRlA-f4-8aI6wDiupSJu-vNXoYI,5571
- scorebook/metrics/__init__.py,sha256=bsEq15LpFt3h0AQQFbnvL4CU7KpIpifVdJAsfduPGXk,48
- scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
- scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
- scorebook/metrics/metric_registry.py,sha256=YcbKGf2kPMQqyqJ9NYVq_-J19rARXSo22HjTW5WU-QU,3404
- scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
- scorebook/score/__init__.py,sha256=CqkslUvOw8QfCCbSfwZgGrbmXeSLpZqIVo4ntrctYuY,66
- scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/score/_async/score_async.py,sha256=SatV9hEUT8MAru2ACSyM03weKX6VTFx7crW59_uX0L8,6155
- scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/score/_sync/score.py,sha256=nANQbuyYyIaWnoTQzyGMwPZRMFP6MmyIyHb1GO1mktQ,6101
- scorebook/score/score_helpers.py,sha256=Gjx2Lgd94ISvunb5CHj-tDWYVEOVj9ySjjVYnnhpk_Q,7086
- scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
- scorebook/types.py,sha256=2lv1YUky7aDGIEPjgj18aKTpBMdmqD01TKLbwli19pQ,4904
- scorebook/utils/__init__.py,sha256=oBTybVHI5EdHIgzb0TeoAnSLMQdUh20Ww6vcL9542Pk,72
- scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
- scorebook/utils/common_helpers.py,sha256=lJIqO9XGf1T3S3rdGBTjZJ1BzVPvaU_XTONEfPApnEM,1218
- scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
- scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
- scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
- scorebook/utils/mock_llm/__init__.py,sha256=dK70wNVBKk4hv1o3fceDTBG1_maFbkMvoOtTriPCe78,1293
- scorebook/utils/mock_llm/data/mock_llm_data.json,sha256=b28j7OCR0igpP0rkXDJAR2NWIiuVkOaAkzB-Miv665Y,381567
- scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
- scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
- scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
- scorebook-0.0.14.dist-info/METADATA,sha256=jPqVszfpCiAKf3yt45XD6lXfIJL1-TFvSMDVGrIoCPs,9491
- scorebook-0.0.14.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- scorebook-0.0.14.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
- scorebook-0.0.14.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
- scorebook-0.0.14.dist-info/RECORD,,