themis-eval 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/_version.py +1 -1
- themis/api.py +99 -13
- themis/experiment/orchestrator.py +61 -5
- themis/experiment/storage.py +163 -19
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- {themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/METADATA +1 -1
- {themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/RECORD +11 -11
- {themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/_version.py CHANGED

```diff
@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.0"
+        return "0.2.1"  # Fallback for development
 
 
 __version__ = _detect_version()
```
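The only code change here is the hardcoded development fallback, which has to be bumped in step with each release. A quick way to see which path is active on a given machine (a sketch using only the stdlib and the distribution name visible above):

```python
# Minimal check of the version-detection pattern in themis/_version.py:
# an installed wheel resolves via importlib.metadata; a bare source checkout
# raises PackageNotFoundError and falls back to the hardcoded string.
from importlib import metadata

try:
    print("installed:", metadata.version("themis-eval"))
except metadata.PackageNotFoundError:
    print("source checkout: the hardcoded fallback (now 0.2.1) is what you get")
```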
themis/api.py CHANGED

```diff
@@ -33,6 +33,7 @@ Example:
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
@@ -52,6 +53,18 @@ from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
 from themis.providers import create_provider
 
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
 
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
@@ -123,6 +136,19 @@
         >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
         Accuracy: 85.00%
     """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
     # Import presets system (lazy import to avoid circular dependencies)
     from themis.presets import get_benchmark_preset, parse_model_name
 
@@ -131,11 +157,23 @@
 
     if is_benchmark:
         benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
         # Get preset configuration
-        preset = get_benchmark_preset(benchmark_name)
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
 
         # Load dataset using preset loader
-        dataset = preset.load_dataset(limit=limit)
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
 
         # Use preset prompt if not overridden
         if prompt is None:
@@ -158,11 +196,14 @@
         dataset_id_field = preset.dataset_id_field
     else:
         # Custom dataset
+        logger.info("Using custom dataset")
         dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
 
         # Limit dataset if requested
         if limit is not None:
             dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
 
         # Use provided prompt or default
         if prompt is None:
@@ -188,7 +229,15 @@
         dataset_id_field = "id"
 
     # Parse model name to get provider and options
-    provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
 
     # Create model spec
     model_spec = ModelSpec(
@@ -214,17 +263,31 @@
     )
 
     # Create provider and router
-    provider = create_provider(provider_name, **provider_options)
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
     router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
 
     # Create runner
-    runner = GenerationRunner(provider=router)
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
 
     # Create evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
 
     # Determine storage location
     if storage is None:
@@ -235,11 +298,15 @@
     # Generate run ID if not provided
     if run_id is None:
         run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
 
     # Create storage backend
     if isinstance(storage_dir, Path):
         from themis.experiment.storage import ExperimentStorage
         storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
     else:
         # Cloud storage (to be implemented in Phase 3)
         raise NotImplementedError(
@@ -264,15 +331,34 @@
     )
 
     # Run locally
-    report = orchestrator.run(
-        dataset=dataset,
-        max_samples=limit,
-        run_id=run_id,
-        resume=resume,
-        on_result=on_result,
-    )
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
 
-    return report
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
```
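Taken together, the api.py changes make a run observable through the standard logging module and actually honor the workers argument: 0.2.0 constructed GenerationRunner without max_parallel, so workers had no effect on generation throughput. A minimal sketch of how this surfaces for a caller (the benchmark and model names are placeholders, not names confirmed by this diff; api_base/api_key are the kwargs the new code inspects):

```python
# Sketch: enable INFO logging to see the new banners, progress, and warnings.
import logging

logging.basicConfig(level=logging.INFO)

from themis.api import evaluate

report = evaluate(
    "my-benchmark",                        # hypothetical benchmark preset name
    model="openai/gpt-4o-mini",            # hypothetical; parsed by parse_model_name()
    workers=8,                             # now forwarded as max_parallel
    api_base="http://localhost:8000/v1",   # logged as "Custom API base: ..."
    api_key="dummy",                       # omitting this logs a ⚠️ warning
)
```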
themis/experiment/orchestrator.py CHANGED

```diff
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,13 +117,23 @@
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
 
         # Initialize run in storage (if storage exists and run doesn't exist)
         if self._cache.has_storage:
@@ -130,18 +145,30 @@
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
 
         # Build evaluation configuration for cache invalidation
         evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -178,9 +205,18 @@
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -197,6 +233,7 @@
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -204,20 +241,35 @@
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
            new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
@@ -229,12 +281,16 @@
         )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {
```
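The progress line in the generation loop fires every tenth completed record and always on the last one, with integer-division percentages. The cadence, checked in isolation:

```python
# Stand-alone check of the orchestrator's progress cadence and percentage math.
total = 25
for completed in range(1, total + 1):
    if completed % 10 == 0 or completed == total:
        print(f"Generation progress: {completed}/{total} ({100 * completed // total}%)")
# prints: 10/25 (40%), 20/25 (80%), 25/25 (100%)
```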
themis/experiment/storage.py CHANGED

```diff
@@ -184,7 +184,7 @@ class ExperimentStorage:
         # In-memory caches
         self._task_index: dict[str, set[str]] = {}
         self._template_index: dict[str, dict[str, str]] = {}
-        self._locks: dict[str, int] = {}  # fd for
+        self._locks: dict[str, tuple[int, int]] = {}  # (fd, count) for reentrant locks
 
     def _init_database(self):
         """Initialize SQLite metadata database."""
@@ -253,34 +253,175 @@
 
     @contextlib.contextmanager
    def _acquire_lock(self, run_id: str):
-        """Acquire exclusive lock for run directory."""
+        """Acquire exclusive lock for run directory with timeout (reentrant).
+
+        This lock is reentrant within the same thread to prevent deadlocks when
+        the same process acquires the lock multiple times (e.g., start_run()
+        followed by append_record()).
+
+        The lock uses OS-specific file locking:
+        - Unix/Linux/macOS: fcntl.flock with non-blocking retry
+        - Windows: msvcrt.locking
+        - Fallback: No locking (single-process mode)
+
+        Args:
+            run_id: Unique run identifier
+
+        Yields:
+            Context manager that holds the lock
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within 30 seconds
+        """
+        import time
+
+        # Check if we already hold the lock (reentrant)
+        if run_id in self._locks:
+            lock_fd, count = self._locks[run_id]
+            self._locks[run_id] = (lock_fd, count + 1)
+            try:
+                yield
+            finally:
+                # Check if lock still exists (might have been cleaned up by another thread)
+                if run_id in self._locks:
+                    lock_fd, count = self._locks[run_id]
+                    if count > 1:
+                        self._locks[run_id] = (lock_fd, count - 1)
+                    else:
+                        # Last unlock - release the actual lock
+                        self._release_os_lock(lock_fd, run_id)
+            return
+
+        # First time acquiring lock for this run_id
         lock_path = self._get_run_dir(run_id) / ".lock"
         lock_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Open lock file
-        lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
+        # Open lock file (OS-independent flags)
+        lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR)
 
         try:
-            # Acquire exclusive lock
-            if sys.platform == "win32":
-                # Windows file locking
-                msvcrt.locking(lock_fd, msvcrt.LK_LOCK, 1)
-            elif FCNTL_AVAILABLE:
-                # Unix file locking
-                fcntl.flock(lock_fd, fcntl.LOCK_EX)
-            # If neither available, proceed without locking (single-process only)
+            # Acquire exclusive lock with timeout
+            self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
 
-            self._locks[run_id] = lock_fd
+            self._locks[run_id] = (lock_fd, 1)
             yield
         finally:
-            # Release lock
-            if sys.platform == "win32":
+            # Release lock (only if this was the outermost lock)
+            if run_id in self._locks:
+                lock_fd, count = self._locks[run_id]
+                if count == 1:
+                    self._release_os_lock(lock_fd, run_id)
+                else:
+                    # Decrement count
+                    self._locks[run_id] = (lock_fd, count - 1)
+
+    def _acquire_os_lock(
+        self,
+        lock_fd: int,
+        run_id: str,
+        lock_path: Path,
+        timeout: int = 30
+    ) -> None:
+        """Acquire OS-specific file lock with timeout.
+
+        Args:
+            lock_fd: File descriptor for lock file
+            run_id: Run identifier (for error messages)
+            lock_path: Path to lock file (for error messages)
+            timeout: Timeout in seconds
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within timeout
+        """
+        import time
+
+        if sys.platform == "win32":
+            # Windows file locking with retry
+            try:
+                import msvcrt
+            except ImportError:
+                # msvcrt not available - single-process mode
+                import logging
+                logger = logging.getLogger(__name__)
+                logger.debug("msvcrt not available. Single-process mode only.")
+                return
+
+            start_time = time.time()
+            while True:
+                try:
+                    msvcrt.locking(lock_fd, msvcrt.LK_NBLCK, 1)
+                    break  # Lock acquired
+                except OSError as e:
+                    # Lock is held by another thread/process (errno 13 Permission denied)
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try deleting: {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        elif FCNTL_AVAILABLE:
+            # Unix file locking with non-blocking retry
+            start_time = time.time()
+            while True:
+                try:
+                    fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    break  # Lock acquired
+                except (IOError, OSError) as e:
+                    # Lock is held by another process
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try: rm -f {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        else:
+            # No locking available - single-process mode
+            # This is safe for single-process usage (most common case)
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.debug(
+                f"File locking not available on this platform. "
+                f"Storage will work in single-process mode only."
+            )
+
+    def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
+        """Release OS-specific file lock.
+
+        Args:
+            lock_fd: File descriptor to close
+            run_id: Run identifier (for cleanup)
+        """
+        # Release lock
+        if sys.platform == "win32":
+            try:
+                import msvcrt
                 msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
-            elif FCNTL_AVAILABLE:
+            except (ImportError, OSError):
+                pass  # Lock may already be released
+        elif FCNTL_AVAILABLE:
+            try:
                 fcntl.flock(lock_fd, fcntl.LOCK_UN)
-
+            except (IOError, OSError):
+                pass  # Lock may already be released
+
+        # Close file descriptor
+        try:
             os.close(lock_fd)
-            self._locks.pop(run_id, None)
+        except OSError:
+            pass  # FD may already be closed
+
+        # Clean up tracking
+        self._locks.pop(run_id, None)
 
     def start_run(
         self,
@@ -456,16 +597,19 @@
 
         try:
             if self._config.compression == "gzip":
+                # Close the fd first since gzip.open will open by path
+                os.close(temp_fd)
                 with gzip.open(temp_path, "wt", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                     os.fsync(f.fileno())
             else:
+                # Use the fd directly
                 with open(temp_fd, "w", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                     os.fsync(f.fileno())
-            os.close(temp_fd)
+            # fd is closed by context manager, don't close again
 
         # Get target path with compression
         target_path = (
```
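The heart of the storage change is the reentrant lock: a second acquisition from the same process bumps a counter instead of calling flock again, so nested start_run()/append_record() paths no longer deadlock, and the OS lock is released only when the count returns to zero. A usage sketch (the run id and storage directory are made up; _acquire_lock is the private context manager shown in the diff above):

```python
# Sketch of the reentrancy semantics, assuming the code in the diff above.
from pathlib import Path
from themis.experiment.storage import ExperimentStorage

storage = ExperimentStorage(Path("./runs"))   # hypothetical storage directory

with storage._acquire_lock("run-demo"):       # first acquire: flock + count = 1
    with storage._acquire_lock("run-demo"):   # nested acquire: count = 2, no flock
        pass                                  # work while holding the lock
    # inner exit: count back to 1, OS lock still held
# outer exit: count hits 0, _release_os_lock() unlocks and closes the fd
```

The second fix in this file is narrower: when writing the temp file uncompressed, open(temp_fd, ...) hands ownership of the descriptor to the file object, so the old trailing os.close(temp_fd) was a double close. The gzip path now closes the fd up front because gzip.open reopens the file by path.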
themis/generation/providers/litellm_provider.py CHANGED

```diff
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 import threading
 from dataclasses import dataclass
 from typing import Any, Dict
@@ -10,6 +11,8 @@ from themis.core import entities as core_entities
 from themis.interfaces import ModelProvider
 from themis.providers import register_provider
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class LiteLLMProvider(ModelProvider):
@@ -57,7 +60,22 @@
             litellm.drop_params = self.drop_params
             if self.max_retries > 0:
                 litellm.num_retries = self.max_retries
+
+            logger.debug(f"LiteLLMProvider initialized:")
+            logger.debug(f"  api_base: {self.api_base or 'default'}")
+            logger.debug(f"  timeout: {self.timeout}s")
+            logger.debug(f"  max_retries: {self.max_retries}")
+            logger.debug(f"  n_parallel: {self.n_parallel}")
+
+            # Warn if api_base is set but no api_key
+            if self.api_base and not self.api_key:
+                logger.warning(
+                    "⚠️ LiteLLMProvider: api_base is set but api_key is not. "
+                    "This may cause authentication errors. "
+                    "Set api_key='dummy' for local servers."
+                )
         except ImportError as exc:
+            logger.error("❌ LiteLLM is not installed")
             raise RuntimeError(
                 "LiteLLM is not installed. Install via `pip install litellm` or "
                 "`uv add litellm` to use LiteLLMProvider."
@@ -70,6 +88,10 @@
 
         messages = self._build_messages(task)
         completion_kwargs = self._build_completion_kwargs(task, messages)
+
+        logger.debug(f"LiteLLMProvider: Calling model={completion_kwargs.get('model')}")
+        if self.api_base:
+            logger.debug(f"LiteLLMProvider: Using custom api_base={self.api_base}")
 
         try:
             with self._semaphore:
@@ -131,6 +153,30 @@
                 details["status_code"] = exc.status_code  # type: ignore
             if hasattr(exc, "llm_provider"):
                 details["llm_provider"] = exc.llm_provider  # type: ignore
+
+            # Log with helpful context
+            if "AuthenticationError" in error_type or "api_key" in error_message.lower():
+                logger.error(
+                    f"LiteLLMProvider: ❌ Authentication error for model {task.model.identifier}"
+                )
+                logger.error(
+                    f"  Error: {error_message[:200]}"
+                )
+                logger.error(
+                    f"  Hint: If using a custom api_base, ensure you also pass api_key='dummy'"
+                )
+            elif "Connection" in error_type or "timeout" in error_message.lower():
+                logger.error(
+                    f"LiteLLMProvider: ❌ Connection error for model {task.model.identifier}"
+                )
+                logger.error(f"  Error: {error_message[:200]}")
+                if self.api_base:
+                    logger.error(f"  Check that the server at {self.api_base} is running")
+            else:
+                logger.error(
+                    f"LiteLLMProvider: ❌ Generation failed for {task.model.identifier}: "
+                    f"{error_type}: {error_message[:200]}"
+                )
 
         return core_entities.GenerationRecord(
             task=task,
```
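The new error handling sorts failures into three buckets before logging hints. The classification itself is plain substring matching, sketched here standalone (all names are local to the sketch):

```python
# Stand-alone sketch of the triage logic added in the except-branch above.
def classify(error_type: str, error_message: str) -> str:
    if "AuthenticationError" in error_type or "api_key" in error_message.lower():
        return "auth"        # hint: pass api_key='dummy' with a custom api_base
    if "Connection" in error_type or "timeout" in error_message.lower():
        return "connection"  # hint: check the server behind api_base is up
    return "other"

assert classify("AuthenticationError", "invalid key") == "auth"
assert classify("APIConnectionError", "connection refused") == "connection"
assert classify("BadRequestError", "invalid payload") == "other"
```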
themis/generation/runner.py CHANGED

```diff
@@ -49,16 +49,32 @@ class GenerationRunner:
     ) -> Iterator[core_entities.GenerationRecord]:
         task_list = list(tasks)
         if not task_list:
+            logger.info("Runner: No tasks to execute")
             return
+
+        logger.info(f"Runner: Starting execution of {len(task_list)} tasks with {self._max_parallel} workers")
+
         if self._max_parallel <= 1:
-            for task in task_list:
+            logger.info("Runner: Using sequential execution (1 worker)")
+            for i, task in enumerate(task_list, 1):
+                logger.debug(f"Runner: Processing task {i}/{len(task_list)}")
                 yield self._execute_task(task)
             return
 
+        logger.info(f"Runner: Using parallel execution ({self._max_parallel} workers)")
         with ThreadPoolExecutor(max_workers=self._max_parallel) as executor:
             futures = [executor.submit(self._execute_task, task) for task in task_list]
+            completed = 0
             for future in futures:
-                yield future.result()
+                try:
+                    result = future.result()
+                    completed += 1
+                    if completed % max(1, len(task_list) // 10) == 0 or completed == len(task_list):
+                        logger.debug(f"Runner: Completed {completed}/{len(task_list)} tasks")
+                    yield result
+                except Exception as e:
+                    logger.error(f"Runner: Task execution failed: {e}")
+                    raise
 
     def _run_single_attempt(
         self, task: core_entities.GenerationTask
@@ -70,7 +86,7 @@
         for attempt in range(1, self._max_retries + 1):
             try:
                 logger.debug(
-                    "Starting generation for %s attempt %s/%s",
+                    "Runner: Starting generation for %s (attempt %s/%s)",
                     task_label,
                     attempt,
                     self._max_retries,
@@ -79,16 +95,16 @@
                 record.metrics["generation_attempts"] = attempt
                 if attempt_errors:
                     record.metrics.setdefault("retry_errors", attempt_errors)
-                logger.debug("Completed %s in %s attempt(s)", task_label, attempt)
+                logger.debug("Runner: ✅ Completed %s in %s attempt(s)", task_label, attempt)
                 return record
             except Exception as exc:  # pragma: no cover - defensive path
                 last_error = exc
                 logger.warning(
-                    "Attempt %s/%s for %s failed: %s",
+                    "Runner: ⚠️ Attempt %s/%s for %s failed: %s",
                     attempt,
                     self._max_retries,
                     task_label,
-                    exc,
+                    str(exc)[:100],  # Truncate long error messages
                 )
                 attempt_errors.append(
                     {
```
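Two details of the parallel path worth noting: futures are iterated in submission order, so results still come back in task order even when later tasks finish first, and the debug progress line fires roughly every 10% of tasks via max(1, len(task_list) // 10). The ordering property, checked in isolation:

```python
# Stand-alone check: iterating submitted futures preserves task order.
import time
from concurrent.futures import ThreadPoolExecutor

def work(i: int) -> int:
    time.sleep(0.01 * (5 - i))  # later tasks finish sooner
    return i

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(work, i) for i in range(5)]
    print([f.result() for f in futures])  # [0, 1, 2, 3, 4]
```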
{themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/RECORD CHANGED

```diff
@@ -1,6 +1,6 @@
 themis/__init__.py,sha256=Pswn5ZiXyU5ANoknjdBLkqouZQdeWMm3DoUMVzU_j8M,543
-themis/_version.py,sha256=
-themis/api.py,sha256=
+themis/_version.py,sha256=R6LtutHSlN-yNUXHD-aPwhshiv94GS8wU_HzIsShIy4,378
+themis/api.py,sha256=l_xRpFQ4U4dJtosm-nVudn1My2qTalMmafBky7e_m6M,15705
 themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
 themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
@@ -112,9 +112,9 @@ themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91A
 themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
 themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
 themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
-themis/experiment/orchestrator.py,sha256=
+themis/experiment/orchestrator.py,sha256=VeSasDmCXrYlrv1r47I698RUq14vEBR7c_uyZzM01hw,19304
 themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
-themis/experiment/storage.py,sha256=
+themis/experiment/storage.py,sha256=ujGiQTeRPOfS8hYHB1a7F9t-dQnXquhqomI1vDjqmno,55250
 themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
 themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
 themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
@@ -123,12 +123,12 @@ themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,
 themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
 themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
 themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
-themis/generation/runner.py,sha256=
+themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
 themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
 themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
 themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
 themis/generation/types.py,sha256=MkJnZk6lMHmHzlJVEsuIC9ioRW8XhWcSk9AdDeb_aLE,338
-themis/generation/providers/litellm_provider.py,sha256=
+themis/generation/providers/litellm_provider.py,sha256=tvLY8hrSjo4CnyWzccFp1PkXj8R2j8pda5irJiarWd8,10334
 themis/generation/providers/vllm_provider.py,sha256=0K4we6xDrRXlBXseC1ixLq2sJpRF4T8Ikv45dw-zNk4,4625
 themis/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFsclB3E,2370
@@ -150,8 +150,8 @@ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,130
 themis/utils/logging_utils.py,sha256=YNSiDfO4LsciSzUhHF1aTVI5rkfnWiVbn1NcGjjmJuQ,1019
 themis/utils/progress.py,sha256=b3YwHKV5x3Cvr5rBukqifJimK3Si4CGY2fpN6a_ZySI,1434
 themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
-themis_eval-0.2.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
-themis_eval-0.2.0.dist-info/METADATA,sha256=
-themis_eval-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-themis_eval-0.2.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
-themis_eval-0.2.0.dist-info/RECORD,,
+themis_eval-0.2.1.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
+themis_eval-0.2.1.dist-info/METADATA,sha256=h1lFivm8bgwIbh4Fw0JCii1-f3JNMDxiyPAJM3qEtV0,15173
+themis_eval-0.2.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+themis_eval-0.2.1.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
+themis_eval-0.2.1.dist-info/RECORD,,
```
{themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/WHEEL
File without changes

{themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE
File without changes

{themis_eval-0.2.0.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt
File without changes