scorebook 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. scorebook/__init__.py +10 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +24 -9
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +22 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +25 -9
  11. scorebook/evaluate/_sync/evaluate.py +25 -9
  12. scorebook/evaluate/evaluate_helpers.py +79 -5
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/__init__.py +1 -18
  17. scorebook/metrics/metric_registry.py +2 -0
  18. scorebook/score/__init__.py +0 -5
  19. scorebook/score/_async/score_async.py +3 -2
  20. scorebook/score/_sync/score.py +3 -2
  21. scorebook/score/score_helpers.py +1 -1
  22. scorebook/types.py +3 -1
  23. scorebook/utils/__init__.py +0 -22
  24. scorebook/utils/common_helpers.py +1 -1
  25. scorebook/utils/mock_llm/__init__.py +41 -0
  26. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  27. scorebook-0.0.14.dist-info/METADATA +292 -0
  28. scorebook-0.0.14.dist-info/RECORD +53 -0
  29. scorebook/trismik/__init__.py +0 -10
  30. scorebook-0.0.12.dist-info/METADATA +0 -389
  31. scorebook-0.0.12.dist-info/RECORD +0 -50
  32. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
  33. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
  34. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results, scoring_progress_context
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 
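The result-upload helper moved from scorebook.trismik to scorebook.dashboard; only the module path changed in this hunk. A minimal migration sketch for code that copied the old import, assuming upload_result keeps the same call signature (the signature itself is not part of this diff):

# 0.0.12
# from scorebook.trismik.upload_results import upload_result

# 0.0.14
from scorebook.dashboard.upload_results import upload_result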
scorebook/score/score_helpers.py CHANGED
@@ -7,7 +7,7 @@ from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.metrics.metric_base import MetricBase
 from scorebook.metrics.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
-from scorebook.utils import is_awaitable
+from scorebook.utils.async_utils import is_awaitable
 
 logger = logging.getLogger(__name__)
 
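is_awaitable must now be imported from scorebook.utils.async_utils rather than from the scorebook.utils package root. A hedged sketch of the pattern such a helper typically supports; treating it as an inspect.isawaitable-style check is an assumption, and the function and argument names below are illustrative only:

from scorebook.utils.async_utils import is_awaitable

async def call_metric(metric_fn, predictions, references):
    # metric_fn may be sync or async; await only when the call returns an awaitable
    result = metric_fn(predictions, references)
    if is_awaitable(result):
        result = await result
    return result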
scorebook/types.py CHANGED
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.metrics.metric_base import MetricBase
 
 # Type alias for metrics parameter
@@ -17,6 +17,7 @@ class AdaptiveEvalDataset:
     """Represents a dataset configured for adaptive evaluation."""
 
     name: str
+    split: Optional[str] = None
 
 
 @dataclass
@@ -50,6 +51,7 @@ class AdaptiveEvalRunSpec:
     hyperparameters_index: int
     experiment_id: str
     project_id: str
+    split: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
 
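Both adaptive-evaluation dataclasses gain an optional split field defaulting to None, so existing call sites keep working. A small sketch; the dataset name is hypothetical:

from scorebook.types import AdaptiveEvalDataset

# 0.0.12-style construction still works because split defaults to None
legacy = AdaptiveEvalDataset(name="my_adaptive_dataset")
# 0.0.14 can pin a specific split explicitly
pinned = AdaptiveEvalDataset(name="my_adaptive_dataset", split="validation")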
scorebook/utils/__init__.py CHANGED
@@ -1,23 +1 @@
 """Utility functions and common helpers for the Scorebook framework."""
-
-from contextlib import nullcontext
-
-from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
-from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
-from scorebook.utils.render_template import render_template
-from scorebook.utils.transform_helpers import expand_dict
-
-__all__ = [
-    "async_nullcontext",
-    "nullcontext",
-    "is_awaitable",
-    "resolve_show_progress",
-    "resolve_upload_results",
-    "validate_path",
-    "expand_dict",
-    "evaluation_progress_context",
-    "scoring_progress_context",
-    "render_template",
-]
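scorebook.utils no longer re-exports its helpers (the __all__ list above is gone), so code that imported them from the package root must switch to the concrete submodules. A migration sketch limited to the names that were previously re-exported:

# 0.0.12
# from scorebook.utils import is_awaitable, validate_path, render_template, expand_dict

# 0.0.14
from scorebook.utils.async_utils import async_nullcontext, is_awaitable
from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
from scorebook.utils.io_helpers import validate_path
from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
from scorebook.utils.render_template import render_template
from scorebook.utils.transform_helpers import expand_dict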
scorebook/utils/common_helpers.py CHANGED
@@ -17,7 +17,7 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
         bool: Whether to upload results to Trismik
     """
     if upload_results == "auto":
-        from scorebook.trismik.credentials import get_token
+        from scorebook.dashboard.credentials import get_token
 
         upload_results = get_token() is not None
     logger.debug("Auto upload results resolved to: %s", upload_results)
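resolve_upload_results("auto") now consults scorebook.dashboard.credentials.get_token() instead of the old trismik module. A usage sketch; the pass-through behaviour for explicit booleans is inferred from the docstring, since the rest of the function lies outside this hunk:

from scorebook.utils.common_helpers import resolve_upload_results

auto = resolve_upload_results("auto")   # True only when get_token() finds stored credentials
forced = resolve_upload_results(True)   # explicit booleans presumably pass through unchanged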
scorebook/utils/mock_llm/__init__.py ADDED
@@ -0,0 +1,41 @@
+"""Mock LLM utilities for testing and demonstrations."""
+
+import json
+import random
+from pathlib import Path
+from typing import Any, List
+
+# Load the mock data once at module initialization
+_DATA_PATH = Path(__file__).parent / "data" / "mock_llm_data.json"
+with open(_DATA_PATH, "r", encoding="utf-8") as f:
+    _MOCK_DATA = json.load(f)
+
+
+def mock_llm(inputs: List[Any], **hyperparameters: Any) -> List[str]:
+    """Mock LLM that returns answers based on pre-recorded accuracy data."""
+
+    results = []
+    all_choices = ["A", "B", "C", "D", "E"]
+
+    for item in inputs:
+        item_id = item["id"]
+
+        # Look up the item in our mock data
+        if item_id not in _MOCK_DATA:
+            # If item not found, return random answer
+            results.append(random.choice(all_choices))
+            continue
+
+        item_data = _MOCK_DATA[item_id]
+        correct_answer = item_data["answer"]
+        was_accurate = item_data["accuracy"]
+
+        if was_accurate:
+            # Return the correct answer
+            results.append(correct_answer)
+        else:
+            # Return a random incorrect answer
+            incorrect_choices = [choice for choice in all_choices if choice != correct_answer]
+            results.append(random.choice(incorrect_choices))
+
+    return results
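A usage sketch for the new mock model. The item ids below are hypothetical and only reproduce recorded accuracy if they exist as keys in data/mock_llm_data.json; unknown ids fall back to a uniformly random letter, and keyword hyperparameters are accepted but ignored by the body above:

from scorebook.utils.mock_llm import mock_llm

items = [{"id": "q-001"}, {"id": "q-002"}]        # hypothetical ids
predictions = mock_llm(items, temperature=0.0)    # kwargs accepted, unused
print(predictions)                                # e.g. ['B', 'D'], one letter A-E per item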