themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/__init__.py CHANGED
@@ -1,14 +1,25 @@
-"""Themis experiment platform."""
+"""Themis experiment platform - Dead simple LLM evaluation.
+
+The primary interface is the `evaluate()` function:
+
+    import themis
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+"""
 
 from themis import config, core, evaluation, experiment, generation, project
 from themis._version import __version__
+from themis.api import evaluate
 
 __all__ = [
+    # Main API
+    "evaluate",
+    # Submodules
     "config",
     "core",
     "evaluation",
     "experiment",
     "generation",
     "project",
+    # Version
     "__version__",
 ]
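The practical effect of this change is that `evaluate` is importable from the package root as well as from `themis.api`. A minimal sketch based on the docstring above (the benchmark name, model, and report attributes are taken from the docstrings in this diff, not verified against the released wheel):

```python
import themis
from themis.api import evaluate  # same function, re-exported at the package root

# One-liner from the new module docstring.
report = themis.evaluate("math500", model="gpt-4", limit=100)
print(report.evaluation_report.metrics)
```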
themis/_version.py CHANGED
@@ -7,9 +7,9 @@ from importlib import metadata
 
 def _detect_version() -> str:
     try:
-        return metadata.version("themis")
+        return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.0.0"
+        return "0.2.1"  # Fallback for development
 
 
 __version__ = _detect_version()
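The new lookup key matters because `importlib.metadata.version()` resolves the installed distribution name (here `themis-eval`), not the importable package name (`themis`). A minimal sketch of the same pattern, independent of Themis:

```python
from importlib import metadata


def detect_version(dist_name: str, fallback: str) -> str:
    """Return the installed distribution's version, or a fallback for source checkouts."""
    try:
        # Keyed by the distribution (wheel/PyPI) name, e.g. "themis-eval",
        # which can differ from the import package name, e.g. "themis".
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        return fallback


print(detect_version("themis-eval", "0.2.1"))
```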
themis/api.py ADDED
@@ -0,0 +1,429 @@
+"""Unified API for Themis - The primary interface for all evaluations.
+
+This module provides the main entry point for running evaluations:
+- Simple one-liner for benchmarks
+- Custom datasets with minimal configuration
+- Distributed execution and cloud storage support
+- Auto-configuration of prompts, metrics, and extractors
+
+Example:
+    ```python
+    import themis
+
+    # Simple benchmark evaluation
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+    # Custom dataset
+    report = themis.evaluate(
+        dataset=[{"id": "1", "question": "...", "answer": "..."}],
+        model="claude-3-opus",
+        prompt="Solve: {question}"
+    )
+
+    # Distributed with cloud storage
+    report = themis.evaluate(
+        "gsm8k",
+        model="gpt-4",
+        distributed=True,
+        workers=8,
+        storage="s3://my-bucket/experiments"
+    )
+    ```
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Callable, Sequence
+
+from themis.core.entities import (
+    ExperimentReport,
+    GenerationRecord,
+    ModelSpec,
+    PromptSpec,
+    SamplingConfig,
+)
+from themis.evaluation.pipeline import EvaluationPipeline
+from themis.experiment.orchestrator import ExperimentOrchestrator
+from themis.generation.plan import GenerationPlan
+from themis.generation.router import ProviderRouter
+from themis.generation.runner import GenerationRunner
+from themis.generation.templates import PromptTemplate
+from themis.providers import create_provider
+
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate(
+    benchmark_or_dataset: str | Sequence[dict[str, Any]],
+    *,
+    model: str,
+    limit: int | None = None,
+    prompt: str | None = None,
+    metrics: list[str] | None = None,
+    temperature: float = 0.0,
+    max_tokens: int = 512,
+    num_samples: int = 1,
+    distributed: bool = False,
+    workers: int = 4,
+    storage: str | Path | None = None,
+    run_id: str | None = None,
+    resume: bool = True,
+    on_result: Callable[[GenerationRecord], None] | None = None,
+    **kwargs: Any,
+) -> ExperimentReport:
+    """Run an LLM evaluation with automatic configuration.
+
+    This is the primary API for Themis. It auto-configures prompts, metrics,
+    and extractors based on the benchmark name, or allows full customization
+    for custom datasets.
+
+    Args:
+        benchmark_or_dataset: Either a benchmark name (e.g., "math500", "gsm8k")
+            or a list of dataset samples as dictionaries. For custom datasets,
+            each dict should have: prompt/question (input), answer/reference (output),
+            and optionally id (unique identifier).
+        model: Model identifier for LiteLLM (e.g., "gpt-4", "claude-3-opus-20240229",
+            "azure/gpt-4", "ollama/llama3"). Provider is auto-detected from the name.
+        limit: Maximum number of samples to evaluate. Use for testing or when you
+            want to evaluate a subset. None means evaluate all samples.
+        prompt: Custom prompt template using Python format strings. Variables like
+            {prompt}, {question}, {context} will be replaced with dataset fields.
+            If None, uses the benchmark's default prompt template.
+        metrics: List of metric names to compute. Available: "ExactMatch", "MathVerify",
+            "BLEU", "ROUGE", "BERTScore", "METEOR", "PassAtK", "CodeBLEU",
+            "ExecutionAccuracy". If None, uses benchmark defaults.
+        temperature: Sampling temperature (0.0 = deterministic/greedy, 1.0 = standard,
+            2.0 = very random). Recommended: 0.0 for evaluation reproducibility.
+        max_tokens: Maximum tokens in model response. Typical values: 256 for short
+            answers, 512 for medium, 2048 for long explanations or code.
+        num_samples: Number of responses to generate per prompt. Use >1 for Pass@K
+            metrics, ensembling, or measuring response variance.
+        distributed: Whether to use distributed execution. Currently a placeholder
+            for future Ray integration.
+        workers: Number of parallel workers for generation. Higher = faster but may
+            hit rate limits. Recommended: 4-16 for APIs, 32+ for local models.
+        storage: Storage location for results and cache. Defaults to ".cache/experiments".
+            Can be a local path or (future) cloud storage URI.
+        run_id: Unique identifier for this run. If None, auto-generated from timestamp
+            (e.g., "run-2024-01-15-123456"). Use meaningful IDs for tracking experiments.
+        resume: Whether to resume from cached results.
+        on_result: Optional callback function called for each result.
+        **kwargs: Additional provider-specific options.
+
+    Returns:
+        ExperimentReport containing generation results, evaluation metrics,
+        and metadata.
+
+    Raises:
+        ValueError: If benchmark is unknown or configuration is invalid.
+        RuntimeError: If evaluation fails.
+
+    Example:
+        >>> report = themis.evaluate("math500", model="gpt-4", limit=10)
+        >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
+        Accuracy: 85.00%
+    """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
+    # Import presets system (lazy import to avoid circular dependencies)
+    from themis.presets import get_benchmark_preset, parse_model_name
+
+    # Determine if we're using a benchmark or custom dataset
+    is_benchmark = isinstance(benchmark_or_dataset, str)
+
+    if is_benchmark:
+        benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
+        # Get preset configuration
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
+
+        # Load dataset using preset loader
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
+
+        # Use preset prompt if not overridden
+        if prompt is None:
+            prompt_template = preset.prompt_template
+        else:
+            prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        # Use preset metrics if not overridden
+        if metrics is None:
+            metrics_list = preset.metrics
+        else:
+            metrics_list = _resolve_metrics(metrics)
+
+        # Use preset extractor
+        extractor = preset.extractor
+
+        # Use preset metadata fields
+        metadata_fields = preset.metadata_fields
+        reference_field = preset.reference_field
+        dataset_id_field = preset.dataset_id_field
+    else:
+        # Custom dataset
+        logger.info("Using custom dataset")
+        dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
+
+        # Limit dataset if requested
+        if limit is not None:
+            dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
+
+        # Use provided prompt or default
+        if prompt is None:
+            raise ValueError(
+                "Custom datasets require a prompt template. "
+                "Example: prompt='Solve: {question}'"
+            )
+        prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        # Use provided metrics or defaults
+        if metrics is None:
+            metrics_list = _resolve_metrics(["exact_match"])
+        else:
+            metrics_list = _resolve_metrics(metrics)
+
+        # Use identity extractor by default
+        from themis.evaluation.extractors import IdentityExtractor
+        extractor = IdentityExtractor()
+
+        # Use standard field names
+        metadata_fields = ()
+        reference_field = "answer"
+        dataset_id_field = "id"
+
+    # Parse model name to get provider and options
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
+
+    # Create model spec
+    model_spec = ModelSpec(
+        identifier=model_id,
+        provider=provider_name,
+    )
+
+    # Create sampling config
+    sampling_config = SamplingConfig(
+        temperature=temperature,
+        top_p=kwargs.get("top_p", 0.95),
+        max_tokens=max_tokens,
+    )
+
+    # Create generation plan
+    plan = GenerationPlan(
+        templates=[prompt_template],
+        models=[model_spec],
+        sampling_parameters=[sampling_config],
+        dataset_id_field=dataset_id_field,
+        reference_field=reference_field,
+        metadata_fields=metadata_fields,
+    )
+
+    # Create provider and router
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
+    router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
+
+    # Create runner
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
+
+    # Create evaluation pipeline
+    pipeline = EvaluationPipeline(
+        extractor=extractor,
+        metrics=metrics_list,
+    )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
+
+    # Determine storage location
+    if storage is None:
+        storage_dir = Path.home() / ".themis" / "runs"
+    else:
+        storage_dir = Path(storage) if not str(storage).startswith(("s3://", "gs://", "azure://")) else storage
+
+    # Generate run ID if not provided
+    if run_id is None:
+        run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
+
+    # Create storage backend
+    if isinstance(storage_dir, Path):
+        from themis.experiment.storage import ExperimentStorage
+        storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
+    else:
+        # Cloud storage (to be implemented in Phase 3)
+        raise NotImplementedError(
+            f"Cloud storage not yet implemented. Use local path for now. "
+            f"Requested: {storage_dir}"
+        )
+
+    # Create orchestrator
+    orchestrator = ExperimentOrchestrator(
+        generation_plan=plan,
+        generation_runner=runner,
+        evaluation_pipeline=pipeline,
+        storage=storage_backend,
+    )
+
+    # Run evaluation
+    if distributed:
+        # Distributed execution (to be implemented in Phase 3)
+        raise NotImplementedError(
+            "Distributed execution not yet implemented. "
+            "Set distributed=False to use local execution."
+        )
+
+    # Run locally
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
+
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
+
+
+def _resolve_metrics(metric_names: list[str]) -> list:
+    """Resolve metric names to metric instances.
+
+    Args:
+        metric_names: List of metric names (e.g., ["exact_match", "bleu"])
+
+    Returns:
+        List of metric instances
+
+    Raises:
+        ValueError: If a metric name is unknown
+    """
+    from themis.evaluation.metrics.exact_match import ExactMatch
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+    from themis.evaluation.metrics.response_length import ResponseLength
+
+    # NLP metrics (Phase 2)
+    try:
+        from themis.evaluation.metrics.nlp import BLEU, ROUGE, BERTScore, METEOR, ROUGEVariant
+        nlp_available = True
+    except ImportError:
+        nlp_available = False
+
+    # Metric registry
+    METRICS_REGISTRY = {
+        # Core metrics
+        "exact_match": ExactMatch,
+        "math_verify": MathVerifyAccuracy,
+        "response_length": ResponseLength,
+    }
+
+    # Add NLP metrics if available
+    if nlp_available:
+        METRICS_REGISTRY.update({
+            "bleu": BLEU,
+            "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
+            "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
+            "rougeL": lambda: ROUGE(variant=ROUGEVariant.ROUGE_L),
+            "bertscore": BERTScore,
+            "meteor": METEOR,
+        })
+
+    # Code metrics (to be added later in Phase 2)
+    # "pass_at_k": PassAtK,
+    # "codebleu": CodeBLEU,
+
+    metrics = []
+    for name in metric_names:
+        if name not in METRICS_REGISTRY:
+            available = ", ".join(sorted(METRICS_REGISTRY.keys()))
+            raise ValueError(
+                f"Unknown metric: {name}. "
+                f"Available metrics: {available}"
+            )
+
+        metric_cls = METRICS_REGISTRY[name]
+        # Handle both class and lambda factory
+        if callable(metric_cls) and not isinstance(metric_cls, type):
+            metrics.append(metric_cls())
+        else:
+            metrics.append(metric_cls())
+
+    return metrics
+
+
+__all__ = ["evaluate"]
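The custom-dataset branch of `evaluate()` above requires an explicit `prompt` template and defaults to the `exact_match` metric. A short sketch of that path, based only on the signature and registry shown in this diff (the dataset rows and the callback are illustrative; field names follow the `id`/`answer` defaults used above):

```python
import themis

rows = [
    {"id": "1", "question": "2 + 2 = ?", "answer": "4"},
    {"id": "2", "question": "10 / 2 = ?", "answer": "5"},
]

def log_record(record):
    # on_result receives each GenerationRecord as it completes.
    print("finished sample:", record)

report = themis.evaluate(
    rows,                                         # custom dataset -> prompt is mandatory
    model="gpt-4",
    prompt="Solve: {question}",                   # Python format string over dataset fields
    metrics=["exact_match", "response_length"],   # keys from _resolve_metrics' registry
    temperature=0.0,
    workers=4,
    storage="./results",                          # local path; cloud URIs currently raise NotImplementedError
    run_id="arith-smoke-test",
    on_result=log_record,
)
print(report.evaluation_report.metrics)
```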
themis/backends/__init__.py ADDED
@@ -0,0 +1,17 @@
+"""Backend interfaces for extending Themis.
+
+This module provides abstract interfaces for implementing custom backends:
+- StorageBackend: Custom storage implementations (cloud, databases, etc.)
+- ExecutionBackend: Custom execution strategies (distributed, async, etc.)
+
+These interfaces allow advanced users to extend Themis without modifying core code.
+"""
+
+from themis.backends.execution import ExecutionBackend, LocalExecutionBackend
+from themis.backends.storage import StorageBackend
+
+__all__ = [
+    "StorageBackend",
+    "ExecutionBackend",
+    "LocalExecutionBackend",
+]
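A brief sketch of how these exports are meant to be consumed, based on the execution interface added later in this diff (the `square` function is purely illustrative):

```python
from themis.backends import LocalExecutionBackend

def square(item: int) -> int:
    # Placeholder for per-sample work (e.g. one generation request).
    return item * item

# ExecutionBackend subclasses are context managers; shutdown() runs on exit.
with LocalExecutionBackend(max_workers=4) as backend:
    for result in backend.map(square, range(8)):
        print(result)  # results may arrive out of input order
```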
themis/backends/execution.py ADDED
@@ -0,0 +1,197 @@
+"""Execution backend interface for custom execution strategies.
+
+This module defines the abstract interface for execution backends, allowing
+users to implement custom execution strategies (distributed, GPU-accelerated,
+async, etc.) without modifying Themis core code.
+
+Example implementations:
+- RayExecutionBackend: Distributed execution with Ray
+- DaskExecutionBackend: Distributed execution with Dask
+- AsyncExecutionBackend: Async/await based execution
+- GPUBatchExecutionBackend: Batched GPU execution for vLLM
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Callable, Iterable, Iterator, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+class ExecutionBackend(ABC):
+    """Abstract interface for execution backends.
+
+    Implement this interface to create custom execution strategies.
+
+    Example:
+        >>> class RayExecutionBackend(ExecutionBackend):
+        ...     def __init__(self, num_cpus: int = 4):
+        ...         import ray
+        ...         if not ray.is_initialized():
+        ...             ray.init(num_cpus=num_cpus)
+        ...
+        ...     def map(self, func, items, max_workers=None):
+        ...         import ray
+        ...         # Convert to Ray remote function
+        ...         remote_func = ray.remote(func)
+        ...         # Submit all tasks
+        ...         futures = [remote_func.remote(item) for item in items]
+        ...         # Get results as they complete
+        ...         for future in futures:
+        ...             yield ray.get(future)
+    """
+
+    @abstractmethod
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function over items in parallel.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Maximum number of parallel workers
+            timeout: Timeout for each execution (seconds)
+            **kwargs: Additional backend-specific options
+
+        Yields:
+            Results as they complete
+
+        Note:
+            Results may be yielded in any order (not necessarily input order).
+            Implementation should handle errors gracefully.
+        """
+        pass
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """Shutdown the execution backend and release resources.
+
+        Called when execution is complete. Should cleanup workers,
+        connections, and other resources.
+        """
+        pass
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.shutdown()
+        return False
+
+
+class LocalExecutionBackend(ExecutionBackend):
+    """Local multi-threaded execution using ThreadPoolExecutor.
+
+    This is the default execution backend, using Python's built-in
+    ThreadPoolExecutor for parallel execution.
+
+    Attributes:
+        executor: ThreadPoolExecutor instance
+    """
+
+    def __init__(self, max_workers: int = 4):
+        """Initialize with number of workers.
+
+        Args:
+            max_workers: Maximum number of worker threads
+        """
+        self._max_workers = max_workers
+        self._executor: ThreadPoolExecutor | None = None
+
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function over items using ThreadPoolExecutor.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Override default max_workers
+            timeout: Timeout for each task (seconds)
+            **kwargs: Ignored (for interface compatibility)
+
+        Yields:
+            Results as they complete
+        """
+        workers = max_workers or self._max_workers
+
+        # Create executor if not exists
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=workers)
+
+        # Submit all tasks
+        items_list = list(items)  # Materialize iterator
+        futures = [self._executor.submit(func, item) for item in items_list]
+
+        # Yield results as they complete
+        for future in as_completed(futures, timeout=timeout):
+            result = future.result()
+            yield result
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
+            self._executor = None
+
+
+class SequentialExecutionBackend(ExecutionBackend):
+    """Sequential execution backend for debugging.
+
+    Executes tasks one at a time without parallelism.
+    Useful for debugging, testing, or when parallelism causes issues.
+    """
+
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function sequentially.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Ignored (no parallelism)
+            timeout: Timeout for each task (seconds)
+            **kwargs: Ignored
+
+        Yields:
+            Results in input order
+        """
+        for item in items:
+            result = func(item)
+            yield result
+
+    def shutdown(self) -> None:
+        """No-op for sequential execution."""
+        pass
+
+
+__all__ = [
+    "ExecutionBackend",
+    "LocalExecutionBackend",
+    "SequentialExecutionBackend",
+]
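Following the contract above (`map()` may yield results out of input order, `shutdown()` releases resources), a custom backend needs very little code. A hypothetical process-pool variant, not part of this release, might look like:

```python
from __future__ import annotations

from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Iterator, TypeVar

from themis.backends import ExecutionBackend

T = TypeVar("T")
R = TypeVar("R")


class ProcessPoolExecutionBackend(ExecutionBackend):
    """Hypothetical process-based sibling of LocalExecutionBackend for CPU-bound work."""

    def __init__(self, max_workers: int = 4):
        # Note: func and items must be picklable for process-based execution.
        self._executor = ProcessPoolExecutor(max_workers=max_workers)

    def map(
        self,
        func: Callable[[T], R],
        items: Iterable[T],
        *,
        max_workers: int | None = None,  # pool size is fixed at construction here
        timeout: float | None = None,
        **kwargs: Any,
    ) -> Iterator[R]:
        futures = [self._executor.submit(func, item) for item in items]
        # Yield results as they complete, out of input order, as the interface allows.
        for future in as_completed(futures, timeout=timeout):
            yield future.result()

    def shutdown(self) -> None:
        self._executor.shutdown(wait=True)
```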