edsl 0.1.54__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +3 -2
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/invigilators/invigilators.py +9 -0
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +86 -6
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +45 -75
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +63 -16
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +117 -6
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/RECORD +99 -75
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
edsl/questions/validation_html_report.py
ADDED
@@ -0,0 +1,404 @@
+"""Generate an HTML report for validation failures.
+
+This module provides functionality to create an HTML report of validation failures,
+including statistics, suggestions for improvements, and examples of common failures.
+"""
+
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from ..config import CONFIG
+from .validation_analysis import (
+    get_validation_failure_stats,
+    suggest_fix_improvements,
+    export_improvements_report
+)
+from .validation_logger import get_validation_failure_logs
+
+HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>EDSL Validation Failures Report</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+        }
+        h1, h2, h3, h4 {
+            color: #2c3e50;
+        }
+        .header {
+            border-bottom: 1px solid #eee;
+            padding-bottom: 10px;
+            margin-bottom: 20px;
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+        }
+        .timestamp {
+            color: #7f8c8d;
+            font-size: 0.9em;
+        }
+        .summary {
+            background-color: #f8f9fa;
+            border-radius: 5px;
+            padding: 15px;
+            margin-bottom: 20px;
+        }
+        .stats-container, .suggestions-container, .examples-container {
+            margin-bottom: 30px;
+        }
+        table {
+            width: 100%;
+            border-collapse: collapse;
+            margin-bottom: 20px;
+        }
+        th, td {
+            padding: 12px 15px;
+            text-align: left;
+            border-bottom: 1px solid #ddd;
+        }
+        th {
+            background-color: #f8f9fa;
+            font-weight: 600;
+        }
+        tr:hover {
+            background-color: #f5f5f5;
+        }
+        .suggestion {
+            background-color: #e3f2fd;
+            border-left: 4px solid #2196f3;
+            padding: 10px 15px;
+            margin-bottom: 10px;
+            border-radius: 0 4px 4px 0;
+        }
+        .card {
+            border: 1px solid #ddd;
+            border-radius: 4px;
+            padding: 15px;
+            margin-bottom: 20px;
+            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+        }
+        .card-header {
+            font-weight: 600;
+            margin-bottom: 10px;
+            padding-bottom: 10px;
+            border-bottom: 1px solid #eee;
+        }
+        .example {
+            background-color: #fff8e1;
+            border-left: 4px solid #ffc107;
+            padding: 10px 15px;
+            margin-bottom: 10px;
+            border-radius: 0 4px 4px 0;
+            overflow-x: auto;
+        }
+        pre {
+            background-color: #f5f5f5;
+            padding: 10px;
+            border-radius: 4px;
+            overflow-x: auto;
+        }
+        code {
+            font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+            font-size: 0.9em;
+        }
+        .badge {
+            display: inline-block;
+            padding: 3px 7px;
+            font-size: 0.75em;
+            font-weight: 600;
+            line-height: 1;
+            text-align: center;
+            white-space: nowrap;
+            vertical-align: baseline;
+            border-radius: 10px;
+            background-color: #e9ecef;
+            margin-right: 5px;
+        }
+        .badge-warning {
+            background-color: #fff3cd;
+            color: #856404;
+        }
+        .badge-primary {
+            background-color: #cfe2ff;
+            color: #084298;
+        }
+        .badge-success {
+            background-color: #d1e7dd;
+            color: #0f5132;
+        }
+        .fix-method {
+            background-color: #e8f5e9;
+            border-left: 4px solid #4caf50;
+            padding: 10px 15px;
+            margin: 10px 0;
+            border-radius: 0 4px 4px 0;
+        }
+    </style>
+</head>
+<body>
+    <div class="header">
+        <h1>EDSL Validation Failures Report</h1>
+        <span class="timestamp">Generated on {{timestamp}}</span>
+    </div>
+
+    <div class="summary">
+        <h2>Summary</h2>
+        <p>This report analyzes validation failures that occurred when question answers didn't meet the expected format or constraints.
+        It provides statistics, improvement suggestions for fix methods, and examples of common failures.</p>
+        <p><strong>Total validation failures:</strong> {{total_failures}}</p>
+        <p><strong>Question types with failures:</strong> {{question_types_count}}</p>
+    </div>
+
+    <div class="stats-container">
+        <h2>Validation Failure Statistics</h2>
+
+        <div class="card">
+            <div class="card-header">Failures by Question Type</div>
+            <table>
+                <thead>
+                    <tr>
+                        <th>Question Type</th>
+                        <th>Failure Count</th>
+                        <th>Percentage</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {{type_stats_rows}}
+                </tbody>
+            </table>
+        </div>
+
+        <div class="card">
+            <div class="card-header">Top Error Messages</div>
+            <table>
+                <thead>
+                    <tr>
+                        <th>Error Message</th>
+                        <th>Occurrence Count</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {{error_stats_rows}}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <div class="suggestions-container">
+        <h2>Fix Method Improvement Suggestions</h2>
+        {{suggestions_content}}
+    </div>
+
+    <div class="examples-container">
+        <h2>Example Validation Failures</h2>
+        {{examples_content}}
+    </div>
+</body>
+</html>
+"""
+
+
+def _generate_type_stats_rows(stats: Dict) -> str:
+    """Generate HTML table rows for question type statistics."""
+    type_stats = stats.get("by_question_type", {})
+    total_failures = sum(type_stats.values())
+
+    rows = []
+    for question_type, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
+        percentage = (count / total_failures) * 100 if total_failures > 0 else 0
+        row = (
+            f"<tr>"
+            f"<td>{question_type}</td>"
+            f"<td>{count}</td>"
+            f"<td>{percentage:.1f}%</td>"
+            f"</tr>"
+        )
+        rows.append(row)
+
+    return "\n".join(rows)
+
+
+def _generate_error_stats_rows(stats: Dict) -> str:
+    """Generate HTML table rows for error message statistics."""
+    error_counts = {}
+
+    # Aggregate error counts across all question types
+    for question_type, errors in stats.get("by_error_message", {}).items():
+        for error_msg, count in errors.items():
+            error_counts[error_msg] = error_counts.get(error_msg, 0) + count
+
+    # Sort by count (descending)
+    sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
+
+    rows = []
+    for error_msg, count in sorted_errors[:10]:  # Show top 10 errors
+        shortened_msg = error_msg[:100] + "..." if len(error_msg) > 100 else error_msg
+        row = (
+            f"<tr>"
+            f"<td>{shortened_msg}</td>"
+            f"<td>{count}</td>"
+            f"</tr>"
+        )
+        rows.append(row)
+
+    return "\n".join(rows)
+
+
+def _generate_suggestions_content(suggestions: Dict) -> str:
+    """Generate HTML content for fix method suggestions."""
+    if not suggestions:
+        return "<p>No suggestions available. Log more validation failures to generate improvement suggestions.</p>"
+
+    content = []
+
+    for question_type, question_suggestions in suggestions.items():
+        content.append(f"<div class='card'>")
+        content.append(f"<div class='card-header'>{question_type}</div>")
+
+        for suggestion in question_suggestions:
+            error_msg = suggestion.get("error_message", "")
+            occurrence_count = suggestion.get("occurrence_count", 0)
+            suggestion_text = suggestion.get("suggestion", "")
+
+            content.append(
+                f"<div class='suggestion'>"
+                f"<p><strong>Error:</strong> {error_msg}</p>"
+                f"<p><strong>Occurrences:</strong> {occurrence_count}</p>"
+                f"<div class='fix-method'>"
+                f"<p><strong>Suggested improvement:</strong></p>"
+                f"<p>{suggestion_text}</p>"
+                f"</div>"
+                f"</div>"
+            )
+
+        content.append("</div>")
+
+    return "\n".join(content)
+
+
+def _generate_examples_content(logs: List[Dict]) -> str:
+    """Generate HTML content for example validation failures."""
+    if not logs:
+        return "<p>No validation failure examples available.</p>"
+
+    content = []
+
+    # Group logs by question type
+    logs_by_type = {}
+    for log in logs:
+        question_type = log.get("question_type", "unknown")
+        if question_type not in logs_by_type:
+            logs_by_type[question_type] = []
+        logs_by_type[question_type].append(log)
+
+    # For each question type, show the most recent example
+    for question_type, type_logs in logs_by_type.items():
+        # Sort by timestamp (newest first)
+        sorted_logs = sorted(type_logs, key=lambda x: x.get("timestamp", ""), reverse=True)
+        example_log = sorted_logs[0]
+
+        error_message = example_log.get("error_message", "")
+        invalid_data = example_log.get("invalid_data", {})
+        model_schema = example_log.get("model_schema", {})
+
+        content.append(f"<div class='card'>")
+        content.append(f"<div class='card-header'>{question_type}</div>")
+
+        content.append(
+            f"<div class='example'>"
+            f"<p><strong>Error:</strong> {error_message}</p>"
+            f"<p><strong>Invalid Data:</strong></p>"
+            f"<pre><code>{json.dumps(invalid_data, indent=2)}</code></pre>"
+            f"<p><strong>Expected Schema:</strong></p>"
+            f"<pre><code>{json.dumps(model_schema, indent=2)}</code></pre>"
+            f"</div>"
+        )
+
+        content.append("</div>")
+
+    return "\n".join(content)
+
+
+def generate_html_report(output_path: Optional[Path] = None) -> Path:
+    """
+    Generate an HTML report of validation failures.
+
+    Args:
+        output_path: Optional custom path for the report
+
+    Returns:
+        Path to the generated HTML report
+    """
+    # Determine output path
+    if output_path is None:
+        default_log_dir = Path.home() / ".edsl" / "logs"
+        try:
+            report_dir = Path(CONFIG.get("EDSL_LOG_DIR"))
+        except Exception:
+            # If EDSL_LOG_DIR is not defined, use default
+            report_dir = default_log_dir
+        os.makedirs(report_dir, exist_ok=True)
+        output_path = report_dir / "validation_report.html"
+
+    # Get validation data
+    logs = get_validation_failure_logs(n=100)  # Get up to 100 recent logs
+    stats = get_validation_failure_stats()
+    suggestions = suggest_fix_improvements()
+
+    # Calculate summary statistics
+    total_failures = sum(stats.get("by_question_type", {}).values())
+    question_types_count = len(stats.get("by_question_type", {}))
+
+    # Generate report content
+    type_stats_rows = _generate_type_stats_rows(stats)
+    error_stats_rows = _generate_error_stats_rows(stats)
+    suggestions_content = _generate_suggestions_content(suggestions)
+    examples_content = _generate_examples_content(logs)
+
+    # Format timestamp
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    # Fill the template
+    html_content = HTML_TEMPLATE.replace("{{timestamp}}", timestamp)
+    html_content = html_content.replace("{{total_failures}}", str(total_failures))
+    html_content = html_content.replace("{{question_types_count}}", str(question_types_count))
+    html_content = html_content.replace("{{type_stats_rows}}", type_stats_rows)
+    html_content = html_content.replace("{{error_stats_rows}}", error_stats_rows)
+    html_content = html_content.replace("{{suggestions_content}}", suggestions_content)
+    html_content = html_content.replace("{{examples_content}}", examples_content)
+
+    # Write the report
+    with open(output_path, "w") as f:
+        f.write(html_content)
+
+    return output_path


def generate_and_open_report() -> None:
    """Generate a validation report and open it in the default browser."""
    report_path = generate_html_report()
    print(f"Report generated at: {report_path}")

    # Try to open the report in a browser
    try:
        import webbrowser
        webbrowser.open(f"file://{report_path}")
    except Exception as e:
        print(f"Could not open browser: {e}")
        print(f"Report is available at: {report_path}")


if __name__ == "__main__":
    generate_and_open_report()
edsl/questions/validation_logger.py
ADDED
@@ -0,0 +1,136 @@
+"""Logger for validation failures in questions.
+
+This module provides functionality to log validation failures that occur when
+question answers don't meet the expected format or constraints. The logs can be
+used to improve the "fix" methods for questions.
+"""
+
+import datetime
+import json
+import logging
+import os
+import traceback
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from ..config import CONFIG
+
+# Set up logging
+logger = logging.getLogger("validation_failures")
+logger.setLevel(logging.INFO)
+
+# Determine log directory path
+DEFAULT_LOG_DIR = Path.home() / ".edsl" / "logs"
+try:
+    LOG_DIR = Path(CONFIG.get("EDSL_LOG_DIR"))
+except Exception:
+    # If EDSL_LOG_DIR is not defined, use default
+    LOG_DIR = DEFAULT_LOG_DIR
+VALIDATION_LOG_FILE = LOG_DIR / "validation_failures.log"
+
+# Create log directory if it doesn't exist
+os.makedirs(LOG_DIR, exist_ok=True)
+
+# Create file handler
+file_handler = logging.FileHandler(VALIDATION_LOG_FILE)
+file_handler.setLevel(logging.INFO)
+
+# Create formatter
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+file_handler.setFormatter(formatter)
+
+# Add handler to logger
+logger.addHandler(file_handler)
+
+# Touch the log file to make sure it exists
+if not os.path.exists(VALIDATION_LOG_FILE):
+    with open(VALIDATION_LOG_FILE, 'a'):
+        pass
+
+
+def log_validation_failure(
+    question_type: str,
+    question_name: str,
+    error_message: str,
+    invalid_data: Dict[str, Any],
+    model_schema: Dict[str, Any],
+    question_dict: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Log a validation failure to the validation failures log file.
+
+    Args:
+        question_type: The type of question that had a validation failure
+        question_name: The name of the question
+        error_message: The validation error message
+        invalid_data: The data that failed validation
+        model_schema: The schema of the model used for validation
+        question_dict: Optional dictionary representation of the question
+    """
+    log_entry = {
+        "timestamp": datetime.datetime.now().isoformat(),
+        "question_type": question_type,
+        "question_name": question_name,
+        "error_message": error_message,
+        "invalid_data": invalid_data,
+        "model_schema": model_schema,
+        "question_dict": question_dict,
+        "traceback": traceback.format_exc(),
+    }
+
+    # Log as JSON for easier parsing
+    logger.info(json.dumps(log_entry))
+
+    # Write directly to the file as well to ensure it's written
+    with open(VALIDATION_LOG_FILE, "a") as f:
+        f.write(f"{datetime.datetime.now().isoformat()} - validation_failures - INFO - {json.dumps(log_entry)}\n")
+        f.flush()
+
+
+def get_validation_failure_logs(n: int = 10) -> list:
+    """
+    Get the latest n validation failure logs.
+
+    Args:
+        n: Number of logs to return (default: 10)
+
+    Returns:
+        List of validation failure log entries as dictionaries
+    """
+    logs = []
+
+    # Check if log file exists
+    if not os.path.exists(VALIDATION_LOG_FILE):
+        return logs
+
+    with open(VALIDATION_LOG_FILE, "r") as f:
+        for line in f:
+            try:
+                # Skip non-JSON lines (like logger initialization)
+                if not line.strip():
+                    continue
+
+                # Handle both the Python logging format and our direct write format
+                parts = line.strip().split(" - ")
+                if len(parts) >= 4:
+                    # Regular log line format: timestamp - name - level - message
+                    json_part = parts[-1]
+                    try:
+                        log_entry = json.loads(json_part)
+                        logs.append(log_entry)
+                    except json.JSONDecodeError:
+                        # Skip malformed JSON
+                        continue
+            except (IndexError, ValueError):
+                # Skip malformed lines
+                continue
+
+    # Return most recent logs first
+    return sorted(logs, key=lambda x: x.get("timestamp", ""), reverse=True)[:n]
+
+
+def clear_validation_logs() -> None:
+    """Clear all validation failure logs."""
+    if os.path.exists(VALIDATION_LOG_FILE):
+        with open(VALIDATION_LOG_FILE, "w") as f:
+            f.write("")
edsl/results/result.py
CHANGED
@@ -450,6 +450,13 @@ class Result(Base, UserDict):
         else:
             d.pop("cache_used_dict", None)
 
+        if hasattr(self, "interview_hash"):
+            d["interview_hash"] = self.interview_hash
+
+        # Preserve the order attribute if it exists
+        if hasattr(self, "order"):
+            d["order"] = self.order
+
         return d
 
     def __hash__(self):
@@ -490,6 +497,13 @@ class Result(Base, UserDict):
             cache_keys=json_dict.get("cache_keys", {}),
             indices=json_dict.get("indices", None),
         )
+        if "interview_hash" in json_dict:
+            result.interview_hash = json_dict["interview_hash"]
+
+        # Restore the order attribute if it exists in the dictionary
+        if "order" in json_dict:
+            result.order = json_dict["order"]
+
         return result
 
     def __repr__(self):
@@ -574,10 +588,12 @@ class Result(Base, UserDict):
         return scoring_function(**params)
 
     @classmethod
-    def from_interview(
-
-
-
+    def from_interview(cls, interview) -> Result:
+        """Return a Result object from an interview dictionary, ensuring no reference to the original interview is maintained."""
+        # Copy the valid results to avoid maintaining references
+        model_response_objects = list(interview.valid_results) if hasattr(interview, 'valid_results') else []
+        # Create a copy of the answers
+        extracted_answers = dict(interview.answers) if hasattr(interview, 'answers') else {}
 
         def get_question_results(
             model_response_objects,
@@ -653,38 +669,69 @@ class Result(Base, UserDict):
 
             return raw_model_results_dictionary, cache_used_dictionary
 
+        # Save essential information from the interview before clearing references
+        agent_copy = interview.agent.copy() if hasattr(interview, 'agent') else None
+        scenario_copy = interview.scenario.copy() if hasattr(interview, 'scenario') else None
+        model_copy = interview.model.copy() if hasattr(interview, 'model') else None
+        iteration = interview.iteration if hasattr(interview, 'iteration') else 0
+        survey_copy = interview.survey.copy() if hasattr(interview, 'survey') and interview.survey else None
+        indices_copy = dict(interview.indices) if hasattr(interview, 'indices') and interview.indices else None
+        initial_hash = interview.initial_hash if hasattr(interview, 'initial_hash') else hash(interview)
+
+        # Process data to create dictionaries needed for Result
         question_results = get_question_results(model_response_objects)
         answer_key_names = list(question_results.keys())
-        generated_tokens_dict = get_generated_tokens_dict(answer_key_names)
-        comments_dict = get_comments_dict(answer_key_names)
-
+        generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
+        comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
+
+        # Get answers that are in the question results
+        answer_dict = {}
+        for k in answer_key_names:
+            if k in extracted_answers:
+                answer_dict[k] = extracted_answers[k]
+
         cache_keys = get_cache_keys(model_response_objects)
 
         question_name_to_prompts = get_question_name_to_prompts(model_response_objects)
         prompt_dictionary = get_prompt_dictionary(
             answer_key_names, question_name_to_prompts
-        )
+        ) if answer_key_names else {}
+
        raw_model_results_dictionary, cache_used_dictionary = (
             get_raw_model_results_and_cache_used_dictionary(model_response_objects)
         )
 
+        # Create the Result object with all copied data
         result = cls(
-            agent=
-            scenario=
-            model=
-            iteration=
-            # Computed objects
+            agent=agent_copy,
+            scenario=scenario_copy,
+            model=model_copy,
+            iteration=iteration,
             answer=answer_dict,
             prompt=prompt_dictionary,
             raw_model_response=raw_model_results_dictionary,
-            survey=
+            survey=survey_copy,
             generated_tokens=generated_tokens_dict,
             comments_dict=comments_dict,
             cache_used_dict=cache_used_dictionary,
-            indices=
+            indices=indices_copy,
             cache_keys=cache_keys,
         )
-
+
+        # Store only the hash, not the interview
+        result.interview_hash = initial_hash
+
+        # Clear references to help garbage collection of the interview
+        if hasattr(interview, 'clear_references'):
+            interview.clear_references()
+
+        # Clear local references to help with garbage collection
+        del model_response_objects
+        del extracted_answers
+        del question_results
+        del answer_key_names
+        del question_name_to_prompts
+
         return result
 
 
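
The net effect of the first two hunks is that interview_hash and order now survive serialization. A round-trip sketch, assuming the two methods shown above are Result.to_dict and Result.from_dict (the json_dict parameter suggests as much) and that Result.example() exists as it does on other edsl Base objects:

    from edsl.results.result import Result

    r = Result.example()
    r.interview_hash = 12345  # normally set by from_interview
    r.order = 0               # normally set by the interview runner

    d = r.to_dict()
    assert d["interview_hash"] == 12345 and d["order"] == 0

    # Both attributes are restored when the keys are present.
    r2 = Result.from_dict(d)
    assert r2.interview_hash == 12345
    assert r2.order == 0

The from_interview rewrite follows the same theme: everything the Result needs is copied out of the interview up front, the interview's own references are cleared (via clear_references, when available), and the Result keeps only the hash, so completed interviews can be garbage-collected during long runs.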