themis-eval 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
themis/_version.py CHANGED
@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.0"  # Fallback for development
+        return "0.2.1"  # Fallback for development
 
 
 __version__ = _detect_version()
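Note: the fallback string above only matters when the package metadata is missing (e.g. running from a source checkout). A minimal sketch of the same stdlib importlib.metadata pattern the file uses:

from importlib import metadata

try:
    version = metadata.version("themis-eval")  # resolves from the installed dist-info
except metadata.PackageNotFoundError:
    version = "0.2.1"  # development fallback, mirroring _detect_version()
print(version)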
themis/api.py CHANGED
@@ -33,6 +33,7 @@ Example:
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
@@ -52,6 +53,18 @@ from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
 from themis.providers import create_provider
 
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
 
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
@@ -123,6 +136,19 @@ def evaluate(
         >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
         Accuracy: 85.00%
     """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
     # Import presets system (lazy import to avoid circular dependencies)
     from themis.presets import get_benchmark_preset, parse_model_name
 
@@ -131,11 +157,23 @@ def evaluate(
 
     if is_benchmark:
         benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
         # Get preset configuration
-        preset = get_benchmark_preset(benchmark_name)
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
 
         # Load dataset using preset loader
-        dataset = preset.load_dataset(limit=limit)
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
 
         # Use preset prompt if not overridden
         if prompt is None:
@@ -158,11 +196,14 @@ def evaluate(
         dataset_id_field = preset.dataset_id_field
     else:
         # Custom dataset
+        logger.info("Using custom dataset")
         dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
 
         # Limit dataset if requested
         if limit is not None:
             dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
 
         # Use provided prompt or default
         if prompt is None:
@@ -188,7 +229,15 @@ def evaluate(
         dataset_id_field = "id"
 
     # Parse model name to get provider and options
-    provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
 
     # Create model spec
     model_spec = ModelSpec(
@@ -214,17 +263,31 @@ def evaluate(
     )
 
     # Create provider and router
-    provider = create_provider(provider_name, **provider_options)
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
     router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
 
     # Create runner
-    runner = GenerationRunner(provider=router)
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
 
     # Create evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
 
     # Determine storage location
     if storage is None:
@@ -235,11 +298,15 @@ def evaluate(
     # Generate run ID if not provided
     if run_id is None:
         run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
 
     # Create storage backend
     if isinstance(storage_dir, Path):
         from themis.experiment.storage import ExperimentStorage
         storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
    else:
        # Cloud storage (to be implemented in Phase 3)
        raise NotImplementedError(
@@ -264,15 +331,34 @@ def evaluate(
     )
 
     # Run locally
-    report = orchestrator.run(
-        dataset=dataset,
-        max_samples=limit,
-        run_id=run_id,
-        resume=resume,
-        on_result=on_result,
-    )
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
 
-    return report
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
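Note: all of these new messages go through the module-level logger rather than print, so they stay silent unless the caller configures logging. A hedged usage sketch — evaluate() and its model/workers parameters appear in this diff, while the benchmark and model strings are hypothetical:

import logging

from themis.api import evaluate

# Surface the new INFO-level progress banners added in 0.2.1
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

report = evaluate(
    "gsm8k",                     # hypothetical benchmark preset name
    model="openai/gpt-4o-mini",  # hypothetical model identifier
    workers=4,
)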
themis/experiment/orchestrator.py CHANGED
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,13 +117,23 @@ class ExperimentOrchestrator:
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
 
         # Initialize run in storage (if storage exists and run doesn't exist)
         if self._cache.has_storage:
@@ -130,18 +145,30 @@ class ExperimentOrchestrator:
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
 
         # Build evaluation configuration for cache invalidation
        evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -178,9 +205,18 @@ class ExperimentOrchestrator:
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -197,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -204,20 +241,35 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
@@ -229,12 +281,16 @@ class ExperimentOrchestrator:
         )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {
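Note: the per-record debug logging above runs alongside the existing on_result hook, which is still invoked once per generation record. A sketch of counting failures through that callback; record.error comes from this diff, while the benchmark and model names are hypothetical:

from themis.api import evaluate

failed = []

def count_failures(record):
    # record.error is set on failed generations (see the orchestrator diff above)
    if record.error:
        failed.append(record)

report = evaluate("gsm8k", model="openai/gpt-4o-mini", on_result=count_failures)
print(f"{len(failed)} generation(s) failed")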
themis/experiment/storage.py CHANGED
@@ -184,7 +184,7 @@ class ExperimentStorage:
         # In-memory caches
         self._task_index: dict[str, set[str]] = {}
         self._template_index: dict[str, dict[str, str]] = {}
-        self._locks: dict[str, int] = {}  # fd for lock files
+        self._locks: dict[str, tuple[int, int]] = {}  # (fd, count) for reentrant locks
 
     def _init_database(self):
         """Initialize SQLite metadata database."""
@@ -253,34 +253,175 @@ class ExperimentStorage:
 
     @contextlib.contextmanager
     def _acquire_lock(self, run_id: str):
-        """Acquire exclusive lock for run directory."""
+        """Acquire exclusive lock for run directory with timeout (reentrant).
+
+        This lock is reentrant within the same thread to prevent deadlocks when
+        the same process acquires the lock multiple times (e.g., start_run()
+        followed by append_record()).
+
+        The lock uses OS-specific file locking:
+        - Unix/Linux/macOS: fcntl.flock with non-blocking retry
+        - Windows: msvcrt.locking
+        - Fallback: No locking (single-process mode)
+
+        Args:
+            run_id: Unique run identifier
+
+        Yields:
+            Context manager that holds the lock
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within 30 seconds
+        """
+        import time
+
+        # Check if we already hold the lock (reentrant)
+        if run_id in self._locks:
+            lock_fd, count = self._locks[run_id]
+            self._locks[run_id] = (lock_fd, count + 1)
+            try:
+                yield
+            finally:
+                # Check if lock still exists (might have been cleaned up by another thread)
+                if run_id in self._locks:
+                    lock_fd, count = self._locks[run_id]
+                    if count > 1:
+                        self._locks[run_id] = (lock_fd, count - 1)
+                    else:
+                        # Last unlock - release the actual lock
+                        self._release_os_lock(lock_fd, run_id)
+            return
+
+        # First time acquiring lock for this run_id
         lock_path = self._get_run_dir(run_id) / ".lock"
         lock_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Open lock file
-        lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
+        # Open lock file (OS-independent flags)
+        lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR)
 
         try:
-            # Acquire exclusive lock (blocking)
-            if sys.platform == "win32":
-                # Windows file locking
-                msvcrt.locking(lock_fd, msvcrt.LK_LOCK, 1)
-            elif FCNTL_AVAILABLE:
-                # Unix file locking
-                fcntl.flock(lock_fd, fcntl.LOCK_EX)
-            # If neither available, proceed without locking (single-process only)
+            # Acquire exclusive lock with timeout
+            self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
 
-            self._locks[run_id] = lock_fd
+            self._locks[run_id] = (lock_fd, 1)
             yield
         finally:
-            # Release lock
-            if sys.platform == "win32":
+            # Release lock (only if this was the outermost lock)
+            if run_id in self._locks:
+                lock_fd, count = self._locks[run_id]
+                if count == 1:
+                    self._release_os_lock(lock_fd, run_id)
+                else:
+                    # Decrement count
+                    self._locks[run_id] = (lock_fd, count - 1)
+
+    def _acquire_os_lock(
+        self,
+        lock_fd: int,
+        run_id: str,
+        lock_path: Path,
+        timeout: int = 30
+    ) -> None:
+        """Acquire OS-specific file lock with timeout.
+
+        Args:
+            lock_fd: File descriptor for lock file
+            run_id: Run identifier (for error messages)
+            lock_path: Path to lock file (for error messages)
+            timeout: Timeout in seconds
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within timeout
+        """
+        import time
+
+        if sys.platform == "win32":
+            # Windows file locking with retry
+            try:
+                import msvcrt
+            except ImportError:
+                # msvcrt not available - single-process mode
+                import logging
+                logger = logging.getLogger(__name__)
+                logger.debug("msvcrt not available. Single-process mode only.")
+                return
+
+            start_time = time.time()
+            while True:
+                try:
+                    msvcrt.locking(lock_fd, msvcrt.LK_NBLCK, 1)
+                    break  # Lock acquired
+                except OSError as e:
+                    # Lock is held by another thread/process (errno 13 Permission denied)
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try deleting: {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        elif FCNTL_AVAILABLE:
+            # Unix file locking with non-blocking retry
+            start_time = time.time()
+            while True:
+                try:
+                    fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    break  # Lock acquired
+                except (IOError, OSError) as e:
+                    # Lock is held by another process
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try: rm -f {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        else:
+            # No locking available - single-process mode
+            # This is safe for single-process usage (most common case)
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.debug(
+                f"File locking not available on this platform. "
+                f"Storage will work in single-process mode only."
+            )
+
+    def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
+        """Release OS-specific file lock.
+
+        Args:
+            lock_fd: File descriptor to close
+            run_id: Run identifier (for cleanup)
+        """
+        # Release lock
+        if sys.platform == "win32":
+            try:
+                import msvcrt
                 msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
-            elif FCNTL_AVAILABLE:
+            except (ImportError, OSError):
+                pass  # Lock may already be released
+        elif FCNTL_AVAILABLE:
+            try:
                 fcntl.flock(lock_fd, fcntl.LOCK_UN)
-
+            except (IOError, OSError):
+                pass  # Lock may already be released
+
+        # Close file descriptor
+        try:
            os.close(lock_fd)
-            self._locks.pop(run_id, None)
+        except OSError:
+            pass  # FD may already be closed
+
+        # Clean up tracking
+        self._locks.pop(run_id, None)
 
     def start_run(
         self,
@@ -456,16 +597,19 @@ class ExperimentStorage:
 
         try:
             if self._config.compression == "gzip":
+                # Close the fd first since gzip.open will open by path
+                os.close(temp_fd)
                 with gzip.open(temp_path, "wt", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                    os.fsync(f.fileno())
             else:
+                # Use the fd directly
                 with open(temp_fd, "w", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                     os.fsync(f.fileno())
-            os.close(temp_fd)
+            # fd is closed by context manager, don't close again
 
             # Get target path with compression
             target_path = (
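Note: the core of this change is the (fd, count) bookkeeping: a process that re-enters _acquire_lock() for the same run_id now increments a counter instead of blocking on its own flock. A standalone, Unix-only sketch of that counting scheme, assuming fcntl is available — not the package's actual class:

import contextlib
import fcntl
import os

_locks: dict[str, tuple[int, int]] = {}  # run_id -> (fd, reentrancy count)

@contextlib.contextmanager
def acquire(run_id: str, lock_path: str):
    if run_id in _locks:
        # Already held by this process: bump the count instead of re-locking
        fd, count = _locks[run_id]
        _locks[run_id] = (fd, count + 1)
        try:
            yield
        finally:
            fd, count = _locks[run_id]
            if count > 1:
                _locks[run_id] = (fd, count - 1)
            else:
                fcntl.flock(fd, fcntl.LOCK_UN)
                os.close(fd)
                _locks.pop(run_id)
        return
    fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
    fcntl.flock(fd, fcntl.LOCK_EX)  # the real code retries with LOCK_NB plus a 30s timeout
    _locks[run_id] = (fd, 1)
    try:
        yield
    finally:
        fd, count = _locks[run_id]
        if count == 1:
            fcntl.flock(fd, fcntl.LOCK_UN)
            os.close(fd)
            _locks.pop(run_id)
        else:
            _locks[run_id] = (fd, count - 1)

# Nested acquisition no longer deadlocks:
with acquire("run-1", "/tmp/run-1.lock"):
    with acquire("run-1", "/tmp/run-1.lock"):
        pass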
themis/generation/providers/litellm_provider.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 import threading
 from dataclasses import dataclass
 from typing import Any, Dict
@@ -10,6 +11,8 @@ from themis.core import entities as core_entities
 from themis.interfaces import ModelProvider
 from themis.providers import register_provider
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class LiteLLMProvider(ModelProvider):
@@ -57,7 +60,22 @@ class LiteLLMProvider(ModelProvider):
             litellm.drop_params = self.drop_params
             if self.max_retries > 0:
                 litellm.num_retries = self.max_retries
+
+            logger.debug(f"LiteLLMProvider initialized:")
+            logger.debug(f"  api_base: {self.api_base or 'default'}")
+            logger.debug(f"  timeout: {self.timeout}s")
+            logger.debug(f"  max_retries: {self.max_retries}")
+            logger.debug(f"  n_parallel: {self.n_parallel}")
+
+            # Warn if api_base is set but no api_key
+            if self.api_base and not self.api_key:
+                logger.warning(
+                    "⚠️ LiteLLMProvider: api_base is set but api_key is not. "
+                    "This may cause authentication errors. "
+                    "Set api_key='dummy' for local servers."
+                )
         except ImportError as exc:
+            logger.error("❌ LiteLLM is not installed")
             raise RuntimeError(
                 "LiteLLM is not installed. Install via `pip install litellm` or "
                 "`uv add litellm` to use LiteLLMProvider."
@@ -70,6 +88,10 @@ class LiteLLMProvider(ModelProvider):
 
         messages = self._build_messages(task)
         completion_kwargs = self._build_completion_kwargs(task, messages)
+
+        logger.debug(f"LiteLLMProvider: Calling model={completion_kwargs.get('model')}")
+        if self.api_base:
+            logger.debug(f"LiteLLMProvider: Using custom api_base={self.api_base}")
 
         try:
             with self._semaphore:
@@ -131,6 +153,30 @@ class LiteLLMProvider(ModelProvider):
             details["status_code"] = exc.status_code  # type: ignore
         if hasattr(exc, "llm_provider"):
             details["llm_provider"] = exc.llm_provider  # type: ignore
+
+        # Log with helpful context
+        if "AuthenticationError" in error_type or "api_key" in error_message.lower():
+            logger.error(
+                f"LiteLLMProvider: ❌ Authentication error for model {task.model.identifier}"
+            )
+            logger.error(
+                f"  Error: {error_message[:200]}"
+            )
+            logger.error(
+                f"  Hint: If using a custom api_base, ensure you also pass api_key='dummy'"
+            )
+        elif "Connection" in error_type or "timeout" in error_message.lower():
+            logger.error(
+                f"LiteLLMProvider: ❌ Connection error for model {task.model.identifier}"
+            )
+            logger.error(f"  Error: {error_message[:200]}")
+            if self.api_base:
+                logger.error(f"  Check that the server at {self.api_base} is running")
+        else:
+            logger.error(
+                f"LiteLLMProvider: ❌ Generation failed for {task.model.identifier}: "
+                f"{error_type}: {error_message[:200]}"
+            )
 
         return core_entities.GenerationRecord(
             task=task,
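Note: the authentication hint above is reachable from the public API, since evaluate() forwards extra keyword arguments toward the provider. A hedged sketch for a local OpenAI-compatible server — the model string is hypothetical, and the api_base/api_key routing is assumed from the handling shown earlier in this diff:

from themis.api import evaluate

report = evaluate(
    "gsm8k",                        # hypothetical benchmark preset name
    model="openai/local-model",     # hypothetical model identifier
    api_base="http://localhost:8000/v1",
    api_key="dummy",                # per the new warning: avoids auth errors on local servers
)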
themis/generation/runner.py CHANGED
@@ -49,16 +49,32 @@ class GenerationRunner:
     ) -> Iterator[core_entities.GenerationRecord]:
         task_list = list(tasks)
         if not task_list:
+            logger.info("Runner: No tasks to execute")
             return
+
+        logger.info(f"Runner: Starting execution of {len(task_list)} tasks with {self._max_parallel} workers")
+
         if self._max_parallel <= 1:
-            for task in task_list:
+            logger.info("Runner: Using sequential execution (1 worker)")
+            for i, task in enumerate(task_list, 1):
+                logger.debug(f"Runner: Processing task {i}/{len(task_list)}")
                 yield self._execute_task(task)
             return
 
+        logger.info(f"Runner: Using parallel execution ({self._max_parallel} workers)")
         with ThreadPoolExecutor(max_workers=self._max_parallel) as executor:
             futures = [executor.submit(self._execute_task, task) for task in task_list]
+            completed = 0
             for future in futures:
-                yield future.result()
+                try:
+                    result = future.result()
+                    completed += 1
+                    if completed % max(1, len(task_list) // 10) == 0 or completed == len(task_list):
+                        logger.debug(f"Runner: Completed {completed}/{len(task_list)} tasks")
+                    yield result
+                except Exception as e:
+                    logger.error(f"Runner: Task execution failed: {e}")
+                    raise
 
     def _run_single_attempt(
         self, task: core_entities.GenerationTask
@@ -70,7 +86,7 @@ class GenerationRunner:
         for attempt in range(1, self._max_retries + 1):
             try:
                 logger.debug(
-                    "Starting generation for %s attempt %s/%s",
+                    "Runner: Starting generation for %s (attempt %s/%s)",
                     task_label,
                     attempt,
                     self._max_retries,
@@ -79,16 +95,16 @@ class GenerationRunner:
                 record.metrics["generation_attempts"] = attempt
                 if attempt_errors:
                     record.metrics.setdefault("retry_errors", attempt_errors)
-                logger.debug("Completed %s in %s attempt(s)", task_label, attempt)
+                logger.debug("Runner: ✅ Completed %s in %s attempt(s)", task_label, attempt)
                 return record
             except Exception as exc:  # pragma: no cover - defensive path
                 last_error = exc
                 logger.warning(
-                    "Attempt %s/%s for %s failed: %s",
+                    "Runner: ⚠️ Attempt %s/%s for %s failed: %s",
                     attempt,
                     self._max_retries,
                     task_label,
-                    exc,
+                    str(exc)[:100],  # Truncate long error messages
                 )
                 attempt_errors.append(
                     {
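Note: the parallel branch logs completions on a deciles-style cadence, max(1, len(task_list) // 10), plus a final tick. A quick illustration of the arithmetic:

n = 45                   # task count
step = max(1, n // 10)   # -> 4, so roughly every tenth of the workload
ticks = [i for i in range(1, n + 1) if i % step == 0 or i == n]
print(ticks)             # [4, 8, ..., 44, 45]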
themis_eval-0.2.0.dist-info/METADATA → themis_eval-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.0
+Version: 0.2.1
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
themis_eval-0.2.0.dist-info/RECORD → themis_eval-0.2.1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 themis/__init__.py,sha256=Pswn5ZiXyU5ANoknjdBLkqouZQdeWMm3DoUMVzU_j8M,543
-themis/_version.py,sha256=xRJB6N107oMsasuLYKaoIzuBo5Oe2hlK3-lGyTzxAC8,378
-themis/api.py,sha256=myHeMaWQMnyjCUAlr9P6cX2Awt50q1XGtyKDCimJgCg,12077
+themis/_version.py,sha256=R6LtutHSlN-yNUXHD-aPwhshiv94GS8wU_HzIsShIy4,378
+themis/api.py,sha256=l_xRpFQ4U4dJtosm-nVudn1My2qTalMmafBky7e_m6M,15705
 themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
 themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
@@ -112,9 +112,9 @@ themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91A
 themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
 themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
 themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
-themis/experiment/orchestrator.py,sha256=-6epspKnPoAJQPKzoNAxd54MrEX3lIhrKyqQ9dmD00A,16120
+themis/experiment/orchestrator.py,sha256=VeSasDmCXrYlrv1r47I698RUq14vEBR7c_uyZzM01hw,19304
 themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
-themis/experiment/storage.py,sha256=QS3fJD79bzgodM5x79yJ2A69O5hTL2r2ROAKSvtRnkI,49471
+themis/experiment/storage.py,sha256=ujGiQTeRPOfS8hYHB1a7F9t-dQnXquhqomI1vDjqmno,55250
 themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
 themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
 themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
@@ -123,12 +123,12 @@ themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,
 themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
 themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
 themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
-themis/generation/runner.py,sha256=iHTE5vSMWMYRrv4PEWMaZflF939nv1wWccK8V0e092c,8009
+themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
 themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
 themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
 themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
 themis/generation/types.py,sha256=MkJnZk6lMHmHzlJVEsuIC9ioRW8XhWcSk9AdDeb_aLE,338
-themis/generation/providers/litellm_provider.py,sha256=rlTuglIwhcvSakCo5G-ffgQtEHbCEX0ZeKk6M1MaWmU,8155
+themis/generation/providers/litellm_provider.py,sha256=tvLY8hrSjo4CnyWzccFp1PkXj8R2j8pda5irJiarWd8,10334
 themis/generation/providers/vllm_provider.py,sha256=0K4we6xDrRXlBXseC1ixLq2sJpRF4T8Ikv45dw-zNk4,4625
 themis/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFsclB3E,2370
@@ -150,8 +150,8 @@ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,130
 themis/utils/logging_utils.py,sha256=YNSiDfO4LsciSzUhHF1aTVI5rkfnWiVbn1NcGjjmJuQ,1019
 themis/utils/progress.py,sha256=b3YwHKV5x3Cvr5rBukqifJimK3Si4CGY2fpN6a_ZySI,1434
 themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
-themis_eval-0.2.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
-themis_eval-0.2.0.dist-info/METADATA,sha256=S4dy0AD2REsRtPfULUYMiYC2Zk8nWgz4BWjBBJz2gHU,15173
-themis_eval-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-themis_eval-0.2.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
-themis_eval-0.2.0.dist-info/RECORD,,
+themis_eval-0.2.1.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
+themis_eval-0.2.1.dist-info/METADATA,sha256=h1lFivm8bgwIbh4Fw0JCii1-f3JNMDxiyPAJM3qEtV0,15173
+themis_eval-0.2.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+themis_eval-0.2.1.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
+themis_eval-0.2.1.dist-info/RECORD,,