themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,393 @@
1
+ """Turn strategies for multi-turn conversations.
2
+
3
+ This module provides strategies for determining the next turn in a conversation.
4
+ Strategies can be fixed (predefined sequences), dynamic (generated based on context),
5
+ or interactive.
6
+
7
+ Examples:
8
+ # Fixed sequence
9
+ strategy = FixedSequenceTurnStrategy([
10
+ "What is 2+2?",
11
+ "What about 3+3?",
12
+ "And 5+5?"
13
+ ])
14
+
15
+ # Dynamic strategy
16
+ def planner(context, record):
17
+ if len(context) < 2:
18
+ return "Can you explain more?"
19
+ return None # Stop
20
+
21
+ strategy = DynamicTurnStrategy(planner)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass
27
+ from typing import Callable, Protocol
28
+
29
+ from themis.core import conversation as conv
30
+ from themis.core import entities as core_entities
31
+
32
+
33
class TurnStrategy(Protocol):
    """Structural interface for objects that pick the next user message.

    An implementation looks at the conversation so far together with the
    most recent generation record and either supplies the next user
    message or signals that the conversation should end.
    """

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Choose the user's next message.

        Args:
            context: Conversation state accumulated so far.
            last_record: Generation record produced by the previous turn.

        Returns:
            The next user message, or None to end the conversation.
        """
        ...
55
+
56
+
57
@dataclass
class FixedSequenceTurnStrategy:
    """Replay a scripted list of user messages, one per turn.

    Handy for scripted conversations and tests: each call to
    ``next_turn`` hands out the next entry of ``messages`` until the
    list runs out.

    Examples:
        strategy = FixedSequenceTurnStrategy([
            "Hello!",
            "How are you?",
            "Goodbye!"
        ])
    """

    messages: list[str]
    # Position in ``messages`` of the entry handed out by the next call.
    _index: int = 0

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Hand out the next scripted message.

        Args:
            context: Current conversation context (not consulted).
            last_record: Last generation record (not consulted).

        Returns:
            The next message, or None once every message has been used.
        """
        cursor = self._index
        if cursor < len(self.messages):
            upcoming = self.messages[cursor]
            self._index = cursor + 1
            return upcoming
        return None

    def reset(self) -> None:
        """Rewind so the next call starts again from the first message."""
        self._index = 0
99
+
100
+
101
@dataclass
class DynamicTurnStrategy:
    """Delegate the choice of the next message to a planner callable.

    The planner receives the conversation context and the last
    generation record and returns the next user message, or None to
    stop.

    Examples:
        def planner(context, record):
            outputs = [msg.content for msg in context.get_messages_by_role("assistant")]
            if "error" in outputs[-1].lower():
                return "Can you try again?"
            elif len(context) >= 10:
                return None  # Stop after 10 messages
            else:
                return "Please continue."

        strategy = DynamicTurnStrategy(planner)
    """

    planner: Callable[
        [conv.ConversationContext, core_entities.GenerationRecord], str | None
    ]

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Ask the planner for the next message.

        Args:
            context: Current conversation context, passed to the planner.
            last_record: Last generation record, passed to the planner.

        Returns:
            Whatever the planner returns: a message, or None to stop.
        """
        decide = self.planner
        next_message = decide(context, last_record)
        return next_message
140
+
141
+
142
@dataclass
class RepeatUntilSuccessTurnStrategy:
    """Repeat the same question until getting a successful response.

    The question is asked at most ``max_attempts`` times. After the first
    attempt, each call checks the text of the last response with
    ``success_checker`` and stops as soon as it reports success.

    Examples:
        strategy = RepeatUntilSuccessTurnStrategy(
            question="What is 2+2?",
            success_checker=lambda output: "4" in output,
            max_attempts=5
        )
    """

    question: str
    success_checker: Callable[[str], bool]
    max_attempts: int = 5
    # Number of times the question has been handed out so far.
    _attempts: int = 0

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Repeat question until success or max attempts.

        Args:
            context: Current conversation context (not consulted).
            last_record: Last generation record; only read once at least
                one attempt has been made.

        Returns:
            The question, or None on success or when the attempt budget
            (``max_attempts``) is used up. A budget of zero or less
            yields None on the very first call.
        """
        # Before the first attempt there is no answer of ours to judge,
        # so the success check only applies from the second call onward.
        if self._attempts > 0 and last_record.output:
            if self.success_checker(last_record.output.text):
                return None  # Success, stop

        # The budget applies to every call, including the first one, so
        # max_attempts <= 0 never emits the question.
        if self._attempts >= self.max_attempts:
            return None  # Give up

        self._attempts += 1
        return self.question

    def reset(self) -> None:
        """Reset attempt counter."""
        self._attempts = 0
195
+
196
+
197
@dataclass
class ConditionalTurnStrategy:
    """Choose next message based on conditions.

    ``conditions`` is an ordered list of ``(predicate, message)`` pairs.
    The message of the first predicate that returns a truthy value is
    used; a message of None ends the conversation. When no predicate
    matches, ``default`` is returned.

    A predicate that raises is treated as "did not match" so that simple
    lambdas (for example ones that dereference a missing output) do not
    abort the conversation. The failure is logged at DEBUG level rather
    than being dropped silently.

    Examples:
        strategy = ConditionalTurnStrategy(
            conditions=[
                (lambda ctx, rec: "error" in rec.output.text.lower(), "Please try again."),
                (lambda ctx, rec: len(ctx) >= 5, None),  # Stop after 5 turns
            ],
            default="Continue."
        )
    """

    conditions: list[
        tuple[
            Callable[[conv.ConversationContext, core_entities.GenerationRecord], bool],
            str | None,
        ]
    ]
    default: str | None = None

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Evaluate conditions and return matching message.

        Args:
            context: Current conversation context, passed to each predicate.
            last_record: Last generation record, passed to each predicate.

        Returns:
            Message from first matching condition, or ``default``.
        """
        for position, (condition, message) in enumerate(self.conditions):
            try:
                # bool() stays inside the try so a failing truth test is
                # skipped the same way as a failing predicate call.
                matched = bool(condition(context, last_record))
            except Exception:
                # Best-effort by design: skip the condition, but leave a
                # trace so broken predicates can be diagnosed.
                import logging

                logging.getLogger(__name__).debug(
                    "Turn condition #%d raised; treating it as not matched",
                    position,
                    exc_info=True,
                )
                continue
            if matched:
                return message

        return self.default
245
+
246
+
247
@dataclass
class ChainedTurnStrategy:
    """Consult several strategies in order and use the first answer.

    Each strategy is asked in list order; the first message that is not
    None wins and later strategies are not consulted for that turn.

    Examples:
        strategy = ChainedTurnStrategy([
            FixedSequenceTurnStrategy(["Hello", "How are you?"]),
            DynamicTurnStrategy(lambda ctx, rec: "Goodbye" if len(ctx) > 5 else None)
        ])
    """

    strategies: list[TurnStrategy]

    def next_turn(
        self,
        context: conv.ConversationContext,
        last_record: core_entities.GenerationRecord,
    ) -> str | None:
        """Return the first non-None message produced by the chain.

        Args:
            context: Current conversation context, forwarded to each strategy.
            last_record: Last generation record, forwarded to each strategy.

        Returns:
            First non-None message, or None if every strategy returns None.
        """
        # Lazy generator: a strategy is only asked once all earlier ones
        # have answered None.
        answers = (
            candidate.next_turn(context, last_record)
            for candidate in self.strategies
        )
        return next((answer for answer in answers if answer is not None), None)
283
+
284
+
285
+ # Helper functions for creating common strategies
286
+
287
+
288
def create_qa_strategy(questions: list[str]) -> FixedSequenceTurnStrategy:
    """Build a scripted Q&A strategy that asks the given questions in order.

    Args:
        questions: Questions to ask, in the order they should be sent.
            The list object is handed to the strategy as-is (not copied).

    Returns:
        FixedSequenceTurnStrategy whose messages are ``questions``.
    """
    scripted = FixedSequenceTurnStrategy(messages=questions)
    return scripted
298
+
299
+
300
def create_max_turns_strategy(
    max_turns: int, message: str = "Continue."
) -> DynamicTurnStrategy:
    """Build a strategy that keeps sending ``message`` up to a size limit.

    Args:
        max_turns: Stop once ``len(context)`` reaches this value.
        message: Message to send on every turn before the limit is hit.

    Returns:
        DynamicTurnStrategy that returns None once the limit is reached.
    """

    def planner(
        context: conv.ConversationContext, record: core_entities.GenerationRecord
    ) -> str | None:
        # NOTE(review): the limit is compared against len(context); whether
        # that counts turns or individual messages depends on
        # ConversationContext.__len__, which is not visible here - confirm.
        return None if len(context) >= max_turns else message

    strategy = DynamicTurnStrategy(planner=planner)
    return strategy
321
+
322
+
323
def create_keyword_stop_strategy(
    keywords: list[str], message: str = "Continue."
) -> DynamicTurnStrategy:
    """Build a strategy that stops once the response mentions a keyword.

    Matching is a case-insensitive substring test against the text of the
    last record's output.

    Args:
        keywords: Keywords that end the conversation when any one appears.
        message: Message to send on every turn until a keyword is seen.

    Returns:
        DynamicTurnStrategy that returns None when a keyword is found.
    """

    def planner(
        context: conv.ConversationContext, record: core_entities.GenerationRecord
    ) -> str | None:
        produced = record.output
        if not produced:
            # Nothing to inspect, so keep the conversation going.
            return message
        haystack = produced.text.lower()
        for keyword in keywords:
            if keyword.lower() in haystack:
                return None
        return message

    return DynamicTurnStrategy(planner=planner)
346
+
347
+
348
+ # Prompt perturbation and seed helpers for robustness sweeps
349
+
350
+ import random
351
+
352
+
353
def set_sampling_seed(task_metadata: dict[str, object], seed: int) -> dict[str, object]:
    """Return a copy of the task metadata carrying a ``sampling_seed`` entry.

    The input mapping is left untouched. The key name is only a
    convention; nothing here makes a provider honour it.
    """
    seeded = dict(task_metadata)
    seeded.update(sampling_seed=int(seed))
    return seeded
361
+
362
+
363
def perturb_prompt(text: str, *, seed: int | None = None, max_changes: int = 2) -> str:
    """Apply small, semantics-preserving perturbations to a prompt.

    Up to ``max_changes`` of the following are applied, each with
    probability 0.5 drawn from a ``random.Random(seed)`` generator:

    * the first ``?`` is doubled to ``??``;
    * a polite filler ("please", "kindly", "if possible") is inserted in
      front of a randomly chosen word.

    The filler is spliced into the original string, so line breaks and
    other whitespace in ``text`` are preserved.

    Args:
        text: Prompt to perturb.
        seed: Seed for the random generator; None gives a nondeterministic
            result.
        max_changes: Upper bound on the number of perturbations applied.

    Returns:
        The perturbed prompt (possibly identical to ``text``).
    """
    import re  # local import keeps this helper self-contained

    rng = random.Random(seed)
    result = text
    changes = 0

    # Optional punctuation swap
    if "?" in result and changes < max_changes and rng.random() < 0.5:
        result = result.replace("?", "??", 1)
        changes += 1

    # Optional polite filler insertion
    fillers = ["please", "kindly", "if possible"]
    if changes < max_changes and rng.random() < 0.5:
        # Start offsets of every whitespace-delimited word; inserting at an
        # offset leaves all surrounding whitespace (newlines included) intact.
        word_starts = [match.start() for match in re.finditer(r"\S+", result)]
        if word_starts:
            position = word_starts[rng.randint(0, len(word_starts) - 1)]
            filler = rng.choice(fillers)
            result = f"{result[:position]}{filler} {result[position:]}"

    return result
385
+
386
+
387
def create_prompt_variants(base_text: str, *, count: int, seed: int) -> list[str]:
    """Produce perturbed variants of a prompt, reproducibly for a given seed.

    A generator seeded with ``seed`` draws one sub-seed per variant, and
    each variant is ``perturb_prompt(base_text, seed=sub_seed)``. At least
    one variant is always returned, even when ``count`` is below 1.
    """
    seed_source = random.Random(seed)
    how_many = max(1, count)
    variants: list[str] = []
    for _ in range(how_many):
        sub_seed = seed_source.randint(0, 1_000_000)
        variants.append(perturb_prompt(base_text, seed=sub_seed))
    return variants
@@ -0,0 +1,9 @@
1
+ """Backwards-compatible aliases for core entities."""
2
+
3
+ from themis.core import entities as core_entities
4
+
5
# Legacy names kept as plain aliases of the corresponding core entities,
# so code importing the old names keeps resolving to the current classes.
GenerationRequest = core_entities.GenerationTask
GenerationResult = core_entities.GenerationRecord
GenerationError = core_entities.ModelError
ModelOutput = core_entities.ModelOutput
SamplingParameters = core_entities.SamplingConfig
File without changes
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import asdict, is_dataclass
5
+ from pathlib import Path
6
+
7
+ from huggingface_hub import HfApi
8
+
9
+ from themis.config.schema import HuggingFaceHubConfig
10
+ from themis.core.entities import ExperimentReport
11
+
12
+
13
def to_dict(obj):
    """Recursively convert ``obj`` into plain dicts and lists.

    Conversion rules, applied in this order:

    * dataclass instances go through ``dataclasses.asdict`` and the result
      is converted again, so ``to_dict``-capable objects and tuples nested
      inside dataclass fields are converted as well;
    * other instances exposing a callable ``to_dict`` use its result;
    * lists and tuples become lists of converted items;
    * dicts keep their keys and get converted values;
    * everything else is returned unchanged.

    Classes are never converted: ``is_dataclass`` is also true for
    dataclass *types*, which ``asdict`` rejects with a TypeError, so class
    objects are passed through untouched.
    """
    is_class = isinstance(obj, type)
    if is_dataclass(obj) and not is_class:
        return to_dict(asdict(obj))
    if not is_class and callable(getattr(obj, "to_dict", None)):
        return obj.to_dict()
    if isinstance(obj, (list, tuple)):
        return [to_dict(item) for item in obj]
    if isinstance(obj, dict):
        return {key: to_dict(value) for key, value in obj.items()}
    return obj
23
+
24
+
25
class HuggingFaceHubUploader:
    """Write experiment results to JSON files and upload them to the Hub.

    Uploads go to ``config.repository`` as a dataset repo, under a folder
    named after the report's ``run_id`` metadata entry.
    """

    def __init__(self, config: HuggingFaceHubConfig):
        self.config = config
        self.api = HfApi()

    def upload_results(self, report: ExperimentReport, storage_path: Path) -> None:
        """Serialize the report and each generation record, then upload them.

        Does nothing unless the config is enabled and names a repository.

        Args:
            report: Experiment report to publish.
            storage_path: Local directory where the JSON files are written
                before being uploaded.
        """
        if not self.config.enable or not self.config.repository:
            return

        # NOTE(review): a missing 'run_id' yields the literal folder name
        # "None" - confirm callers always set it.
        run_id = report.metadata.get("run_id")

        # Upload the full report as a JSON file
        self._write_and_upload(
            to_dict(report),
            storage_path / "report.json",
            f"{run_id}/report.json",
        )

        # Upload individual generation results
        for record in report.generation_results:
            # NOTE(review): records sharing a 'dataset_id' (or lacking one)
            # map to the same file name, so later ones overwrite earlier
            # ones - confirm ids are unique per record.
            dataset_id = record.task.metadata.get("dataset_id")
            self._write_and_upload(
                to_dict(record),
                storage_path / f"{dataset_id}.json",
                f"{run_id}/generations/{dataset_id}.json",
            )

    def _write_and_upload(self, payload, local_path: Path, path_in_repo: str) -> None:
        """Dump ``payload`` as JSON to ``local_path`` and upload that file."""
        # Explicit encoding keeps the written bytes independent of the
        # platform's default locale.
        with open(local_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=4)

        self.api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=path_in_repo,
            repo_id=self.config.repository,
            repo_type="dataset",
        )
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import wandb
4
+
5
+ from themis.config.schema import WandbConfig
6
+ from themis.core.entities import ExperimentReport
7
+
8
+
9
class WandbTracker:
    """Report experiment configuration and results to Weights & Biases.

    Every method is a no-op when ``config.enable`` is false.
    """

    def __init__(self, config: WandbConfig):
        self.config = config

    def init(self, experiment_config: dict) -> None:
        """Start a wandb run using the project, entity and tags from the config.

        Args:
            experiment_config: Stored as the run's config.
        """
        if not self.config.enable:
            return
        wandb.init(
            project=self.config.project,
            entity=self.config.entity,
            tags=self.config.tags,
            config=experiment_config,
        )

    def log_results(self, report: ExperimentReport) -> None:
        """Publish summary counters, metric means and a per-sample table.

        Args:
            report: Experiment report whose metadata, evaluation metrics
                and generation results are logged.
        """
        if not self.config.enable:
            return

        summary = {
            "total_samples": report.metadata.get("total_samples"),
            "successful_generations": report.metadata.get("successful_generations"),
            "failed_generations": report.metadata.get("failed_generations"),
            "evaluation_failures": report.metadata.get("evaluation_failures"),
        }
        for name, aggregate in report.evaluation_report.metrics.items():
            summary[f"{name}_mean"] = aggregate.mean
        wandb.summary.update(summary)

        # Index evaluation records once instead of scanning the whole list
        # for every generation record. setdefault keeps the first record
        # per sample id, matching a first-match linear search.
        # Assumes sample ids are hashable (e.g. strings).
        evaluations_by_sample = {}
        for evaluation in report.evaluation_report.records:
            evaluations_by_sample.setdefault(evaluation.sample_id, evaluation)

        records_table = wandb.Table(
            columns=[
                "sample_id",
                "prompt",
                "raw_response",
                "parsed_response",
                "error",
                "metric_scores",
            ]
        )
        for record in report.generation_results:
            sample_id = record.task.metadata.get("dataset_id")
            eval_record = evaluations_by_sample.get(sample_id)
            # NOTE(review): 'record.responses' and
            # 'eval_record.parsed_response' are read as-is; the entity
            # definitions are not visible here - confirm both exist.
            records_table.add_data(
                sample_id,
                record.task.prompt,
                [resp.text for resp in record.responses],
                eval_record.parsed_response if eval_record else None,
                record.error.message if record.error else None,
                {s.metric_name: s.value for s in eval_record.scores}
                if eval_record
                else None,
            )
        wandb.log({"generation_results": records_table})
@@ -0,0 +1,83 @@
1
+ """Interfaces (ports) that external adapters must implement."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Iterable, Protocol, Sequence, runtime_checkable
7
+
8
+ from themis.core import entities
9
+
10
+
11
class ModelProvider(ABC):
    """Base class for anything that can carry out a generation task.

    Subclasses must override ``generate``; the base class cannot be
    instantiated.
    """

    @abstractmethod
    def generate(
        self, task: entities.GenerationTask
    ) -> entities.GenerationRecord:  # pragma: no cover - abstract
        """Run ``task`` and return the resulting generation record."""
        raise NotImplementedError()
19
+
20
+
21
@runtime_checkable
class DatasetAdapter(Protocol):
    """Structural protocol for objects that supply raw samples to experiments.

    Any class with an ``iter_samples`` method satisfies the protocol; no
    inheritance is required. Because the protocol is decorated with
    ``@runtime_checkable``, ``isinstance()`` checks work at runtime (they
    only verify that the method is present).

    Required Methods:
        iter_samples: Returns an iterable of sample dictionaries

    Example:
        >>> class MyDataset:
        ...     def iter_samples(self):
        ...         return iter([{"id": "1", "text": "sample"}])
        ...
        >>> isinstance(MyDataset(), DatasetAdapter)  # True at runtime
    """

    def iter_samples(self) -> Iterable[dict[str, Any]]:  # pragma: no cover - protocol
        """Yield the dataset's samples.

        Returns:
            Iterable of dictionaries, one per dataset sample

        Example:
            >>> for sample in dataset.iter_samples():
            ...     print(sample["id"])
        """
        ...
56
+
57
+
58
class Extractor(Protocol):
    """Structural protocol for objects that extract a value from raw output."""

    def extract(self, raw_output: str) -> Any:  # pragma: no cover - protocol
        """Return the value extracted from ``raw_output``."""
        ...
61
+
62
+
63
class Metric(ABC):
    """Base class for metrics that score a prediction.

    Subclasses must override ``compute``. ``name`` is declared without a
    value here; ``requires_reference`` defaults to True.
    """

    name: str
    requires_reference: bool = True

    @abstractmethod
    def compute(
        self,
        *,
        prediction: Any,
        references: Sequence[Any],
        metadata: dict[str, Any] | None = None,
    ) -> entities.MetricScore:  # pragma: no cover - abstract
        """Score ``prediction`` against ``references``.

        All arguments are keyword-only.

        Args:
            prediction: Value to score.
            references: Reference values to compare against.
            metadata: Optional extra information for the metric.

        Returns:
            The resulting metric score.
        """
        raise NotImplementedError()
76
+
77
+
78
# Names exported by ``from themis.interfaces import *``.
__all__ = ["ModelProvider", "DatasetAdapter", "Extractor", "Metric"]
@@ -0,0 +1,20 @@
1
+ """Project helpers for managing experiment collections."""
2
+
3
+ from themis.project.definitions import Project, ProjectExperiment
4
+ from themis.project.patterns import (
5
+ AblationChart,
6
+ AblationChartPoint,
7
+ AblationVariant,
8
+ XAbationPattern,
9
+ XAbationPatternApplication,
10
+ )
11
+
12
# Names exported by ``from themis.project import *``.
# NOTE(review): "XAbation..." looks like a misspelling of "XAblation...",
# but it is the name imported from themis.project.patterns above - confirm
# against that module before renaming anything.
__all__ = [
    "Project", "ProjectExperiment",
    "AblationChart", "AblationChartPoint", "AblationVariant",
    "XAbationPattern", "XAbationPatternApplication",
]
+ ]