edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/buckets/__init__.py +8 -3
  8. edsl/buckets/bucket_collection.py +9 -3
  9. edsl/buckets/model_buckets.py +4 -2
  10. edsl/buckets/token_bucket.py +2 -2
  11. edsl/buckets/token_bucket_client.py +5 -3
  12. edsl/caching/cache.py +131 -62
  13. edsl/caching/cache_entry.py +70 -58
  14. edsl/caching/sql_dict.py +17 -0
  15. edsl/cli.py +99 -0
  16. edsl/config/config_class.py +16 -0
  17. edsl/conversation/__init__.py +31 -0
  18. edsl/coop/coop.py +276 -242
  19. edsl/coop/coop_jobs_objects.py +59 -0
  20. edsl/coop/coop_objects.py +29 -0
  21. edsl/coop/coop_regular_objects.py +26 -0
  22. edsl/coop/utils.py +24 -19
  23. edsl/dataset/dataset.py +338 -101
  24. edsl/db_list/sqlite_list.py +349 -0
  25. edsl/inference_services/__init__.py +40 -5
  26. edsl/inference_services/exceptions.py +11 -0
  27. edsl/inference_services/services/anthropic_service.py +5 -2
  28. edsl/inference_services/services/aws_bedrock.py +6 -2
  29. edsl/inference_services/services/azure_ai.py +6 -2
  30. edsl/inference_services/services/google_service.py +3 -2
  31. edsl/inference_services/services/mistral_ai_service.py +6 -2
  32. edsl/inference_services/services/open_ai_service.py +6 -2
  33. edsl/inference_services/services/perplexity_service.py +6 -2
  34. edsl/inference_services/services/test_service.py +105 -7
  35. edsl/interviews/answering_function.py +167 -59
  36. edsl/interviews/interview.py +124 -72
  37. edsl/interviews/interview_task_manager.py +10 -0
  38. edsl/invigilators/invigilators.py +10 -1
  39. edsl/jobs/async_interview_runner.py +146 -104
  40. edsl/jobs/data_structures.py +6 -4
  41. edsl/jobs/decorators.py +61 -0
  42. edsl/jobs/fetch_invigilator.py +61 -18
  43. edsl/jobs/html_table_job_logger.py +14 -2
  44. edsl/jobs/jobs.py +180 -104
  45. edsl/jobs/jobs_component_constructor.py +2 -2
  46. edsl/jobs/jobs_interview_constructor.py +2 -0
  47. edsl/jobs/jobs_pricing_estimation.py +127 -46
  48. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  49. edsl/jobs/jobs_runner_status.py +30 -25
  50. edsl/jobs/progress_bar_manager.py +79 -0
  51. edsl/jobs/remote_inference.py +35 -1
  52. edsl/key_management/key_lookup_builder.py +6 -1
  53. edsl/language_models/language_model.py +102 -12
  54. edsl/language_models/model.py +10 -3
  55. edsl/language_models/price_manager.py +45 -75
  56. edsl/language_models/registry.py +5 -0
  57. edsl/language_models/utilities.py +2 -1
  58. edsl/notebooks/notebook.py +77 -10
  59. edsl/questions/VALIDATION_README.md +134 -0
  60. edsl/questions/__init__.py +24 -1
  61. edsl/questions/exceptions.py +21 -0
  62. edsl/questions/question_check_box.py +171 -149
  63. edsl/questions/question_dict.py +243 -51
  64. edsl/questions/question_multiple_choice_with_other.py +624 -0
  65. edsl/questions/question_registry.py +2 -1
  66. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  67. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  68. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  69. edsl/questions/validation_analysis.py +185 -0
  70. edsl/questions/validation_cli.py +131 -0
  71. edsl/questions/validation_html_report.py +404 -0
  72. edsl/questions/validation_logger.py +136 -0
  73. edsl/results/result.py +63 -16
  74. edsl/results/results.py +702 -171
  75. edsl/scenarios/construct_download_link.py +16 -3
  76. edsl/scenarios/directory_scanner.py +226 -226
  77. edsl/scenarios/file_methods.py +5 -0
  78. edsl/scenarios/file_store.py +117 -6
  79. edsl/scenarios/handlers/__init__.py +5 -1
  80. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  81. edsl/scenarios/handlers/webm_file_store.py +104 -0
  82. edsl/scenarios/scenario.py +120 -101
  83. edsl/scenarios/scenario_list.py +800 -727
  84. edsl/scenarios/scenario_list_gc_test.py +146 -0
  85. edsl/scenarios/scenario_list_memory_test.py +214 -0
  86. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  87. edsl/scenarios/scenario_selector.py +5 -4
  88. edsl/scenarios/scenario_source.py +1990 -0
  89. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  90. edsl/surveys/survey.py +22 -0
  91. edsl/tasks/__init__.py +4 -2
  92. edsl/tasks/task_history.py +198 -36
  93. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  94. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  95. edsl/utilities/__init__.py +2 -1
  96. edsl/utilities/decorators.py +121 -0
  97. edsl/utilities/memory_debugger.py +1010 -0
  98. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
  99. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
  100. edsl/jobs/jobs_runner_asyncio.py +0 -281
  101. edsl/language_models/unused/fake_openai_service.py +0 -60
  102. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
  103. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
  104. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,404 @@
1
+ """Generate an HTML report for validation failures.
2
+
3
+ This module provides functionality to create an HTML report of validation failures,
4
+ including statistics, suggestions for improvements, and examples of common failures.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
+ from ..config import CONFIG
14
+ from .validation_analysis import (
15
+ get_validation_failure_stats,
16
+ suggest_fix_improvements,
17
+ export_improvements_report
18
+ )
19
+ from .validation_logger import get_validation_failure_logs
20
+
21
+ HTML_TEMPLATE = """
22
+ <!DOCTYPE html>
23
+ <html lang="en">
24
+ <head>
25
+ <meta charset="UTF-8">
26
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
27
+ <title>EDSL Validation Failures Report</title>
28
+ <style>
29
+ body {
30
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
31
+ line-height: 1.6;
32
+ color: #333;
33
+ max-width: 1200px;
34
+ margin: 0 auto;
35
+ padding: 20px;
36
+ }
37
+ h1, h2, h3, h4 {
38
+ color: #2c3e50;
39
+ }
40
+ .header {
41
+ border-bottom: 1px solid #eee;
42
+ padding-bottom: 10px;
43
+ margin-bottom: 20px;
44
+ display: flex;
45
+ justify-content: space-between;
46
+ align-items: center;
47
+ }
48
+ .timestamp {
49
+ color: #7f8c8d;
50
+ font-size: 0.9em;
51
+ }
52
+ .summary {
53
+ background-color: #f8f9fa;
54
+ border-radius: 5px;
55
+ padding: 15px;
56
+ margin-bottom: 20px;
57
+ }
58
+ .stats-container, .suggestions-container, .examples-container {
59
+ margin-bottom: 30px;
60
+ }
61
+ table {
62
+ width: 100%;
63
+ border-collapse: collapse;
64
+ margin-bottom: 20px;
65
+ }
66
+ th, td {
67
+ padding: 12px 15px;
68
+ text-align: left;
69
+ border-bottom: 1px solid #ddd;
70
+ }
71
+ th {
72
+ background-color: #f8f9fa;
73
+ font-weight: 600;
74
+ }
75
+ tr:hover {
76
+ background-color: #f5f5f5;
77
+ }
78
+ .suggestion {
79
+ background-color: #e3f2fd;
80
+ border-left: 4px solid #2196f3;
81
+ padding: 10px 15px;
82
+ margin-bottom: 10px;
83
+ border-radius: 0 4px 4px 0;
84
+ }
85
+ .card {
86
+ border: 1px solid #ddd;
87
+ border-radius: 4px;
88
+ padding: 15px;
89
+ margin-bottom: 20px;
90
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
91
+ }
92
+ .card-header {
93
+ font-weight: 600;
94
+ margin-bottom: 10px;
95
+ padding-bottom: 10px;
96
+ border-bottom: 1px solid #eee;
97
+ }
98
+ .example {
99
+ background-color: #fff8e1;
100
+ border-left: 4px solid #ffc107;
101
+ padding: 10px 15px;
102
+ margin-bottom: 10px;
103
+ border-radius: 0 4px 4px 0;
104
+ overflow-x: auto;
105
+ }
106
+ pre {
107
+ background-color: #f5f5f5;
108
+ padding: 10px;
109
+ border-radius: 4px;
110
+ overflow-x: auto;
111
+ }
112
+ code {
113
+ font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
114
+ font-size: 0.9em;
115
+ }
116
+ .badge {
117
+ display: inline-block;
118
+ padding: 3px 7px;
119
+ font-size: 0.75em;
120
+ font-weight: 600;
121
+ line-height: 1;
122
+ text-align: center;
123
+ white-space: nowrap;
124
+ vertical-align: baseline;
125
+ border-radius: 10px;
126
+ background-color: #e9ecef;
127
+ margin-right: 5px;
128
+ }
129
+ .badge-warning {
130
+ background-color: #fff3cd;
131
+ color: #856404;
132
+ }
133
+ .badge-primary {
134
+ background-color: #cfe2ff;
135
+ color: #084298;
136
+ }
137
+ .badge-success {
138
+ background-color: #d1e7dd;
139
+ color: #0f5132;
140
+ }
141
+ .fix-method {
142
+ background-color: #e8f5e9;
143
+ border-left: 4px solid #4caf50;
144
+ padding: 10px 15px;
145
+ margin: 10px 0;
146
+ border-radius: 0 4px 4px 0;
147
+ }
148
+ </style>
149
+ </head>
150
+ <body>
151
+ <div class="header">
152
+ <h1>EDSL Validation Failures Report</h1>
153
+ <span class="timestamp">Generated on {{timestamp}}</span>
154
+ </div>
155
+
156
+ <div class="summary">
157
+ <h2>Summary</h2>
158
+ <p>This report analyzes validation failures that occurred when question answers didn't meet the expected format or constraints.
159
+ It provides statistics, improvement suggestions for fix methods, and examples of common failures.</p>
160
+ <p><strong>Total validation failures:</strong> {{total_failures}}</p>
161
+ <p><strong>Question types with failures:</strong> {{question_types_count}}</p>
162
+ </div>
163
+
164
+ <div class="stats-container">
165
+ <h2>Validation Failure Statistics</h2>
166
+
167
+ <div class="card">
168
+ <div class="card-header">Failures by Question Type</div>
169
+ <table>
170
+ <thead>
171
+ <tr>
172
+ <th>Question Type</th>
173
+ <th>Failure Count</th>
174
+ <th>Percentage</th>
175
+ </tr>
176
+ </thead>
177
+ <tbody>
178
+ {{type_stats_rows}}
179
+ </tbody>
180
+ </table>
181
+ </div>
182
+
183
+ <div class="card">
184
+ <div class="card-header">Top Error Messages</div>
185
+ <table>
186
+ <thead>
187
+ <tr>
188
+ <th>Error Message</th>
189
+ <th>Occurrence Count</th>
190
+ </tr>
191
+ </thead>
192
+ <tbody>
193
+ {{error_stats_rows}}
194
+ </tbody>
195
+ </table>
196
+ </div>
197
+ </div>
198
+
199
+ <div class="suggestions-container">
200
+ <h2>Fix Method Improvement Suggestions</h2>
201
+ {{suggestions_content}}
202
+ </div>
203
+
204
+ <div class="examples-container">
205
+ <h2>Example Validation Failures</h2>
206
+ {{examples_content}}
207
+ </div>
208
+ </body>
209
+ </html>
210
+ """
211
+
212
+
213
+ def _generate_type_stats_rows(stats: Dict) -> str:
214
+ """Generate HTML table rows for question type statistics."""
215
+ type_stats = stats.get("by_question_type", {})
216
+ total_failures = sum(type_stats.values())
217
+
218
+ rows = []
219
+ for question_type, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
220
+ percentage = (count / total_failures) * 100 if total_failures > 0 else 0
221
+ row = (
222
+ f"<tr>"
223
+ f"<td>{question_type}</td>"
224
+ f"<td>{count}</td>"
225
+ f"<td>{percentage:.1f}%</td>"
226
+ f"</tr>"
227
+ )
228
+ rows.append(row)
229
+
230
+ return "\n".join(rows)
231
+
232
+
233
+ def _generate_error_stats_rows(stats: Dict) -> str:
234
+ """Generate HTML table rows for error message statistics."""
235
+ error_counts = {}
236
+
237
+ # Aggregate error counts across all question types
238
+ for question_type, errors in stats.get("by_error_message", {}).items():
239
+ for error_msg, count in errors.items():
240
+ error_counts[error_msg] = error_counts.get(error_msg, 0) + count
241
+
242
+ # Sort by count (descending)
243
+ sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
244
+
245
+ rows = []
246
+ for error_msg, count in sorted_errors[:10]: # Show top 10 errors
247
+ shortened_msg = error_msg[:100] + "..." if len(error_msg) > 100 else error_msg
248
+ row = (
249
+ f"<tr>"
250
+ f"<td>{shortened_msg}</td>"
251
+ f"<td>{count}</td>"
252
+ f"</tr>"
253
+ )
254
+ rows.append(row)
255
+
256
+ return "\n".join(rows)
257
+
258
+
259
+ def _generate_suggestions_content(suggestions: Dict) -> str:
260
+ """Generate HTML content for fix method suggestions."""
261
+ if not suggestions:
262
+ return "<p>No suggestions available. Log more validation failures to generate improvement suggestions.</p>"
263
+
264
+ content = []
265
+
266
+ for question_type, question_suggestions in suggestions.items():
267
+ content.append(f"<div class='card'>")
268
+ content.append(f"<div class='card-header'>{question_type}</div>")
269
+
270
+ for suggestion in question_suggestions:
271
+ error_msg = suggestion.get("error_message", "")
272
+ occurrence_count = suggestion.get("occurrence_count", 0)
273
+ suggestion_text = suggestion.get("suggestion", "")
274
+
275
+ content.append(
276
+ f"<div class='suggestion'>"
277
+ f"<p><strong>Error:</strong> {error_msg}</p>"
278
+ f"<p><strong>Occurrences:</strong> {occurrence_count}</p>"
279
+ f"<div class='fix-method'>"
280
+ f"<p><strong>Suggested improvement:</strong></p>"
281
+ f"<p>{suggestion_text}</p>"
282
+ f"</div>"
283
+ f"</div>"
284
+ )
285
+
286
+ content.append("</div>")
287
+
288
+ return "\n".join(content)
289
+
290
+
291
+ def _generate_examples_content(logs: List[Dict]) -> str:
292
+ """Generate HTML content for example validation failures."""
293
+ if not logs:
294
+ return "<p>No validation failure examples available.</p>"
295
+
296
+ content = []
297
+
298
+ # Group logs by question type
299
+ logs_by_type = {}
300
+ for log in logs:
301
+ question_type = log.get("question_type", "unknown")
302
+ if question_type not in logs_by_type:
303
+ logs_by_type[question_type] = []
304
+ logs_by_type[question_type].append(log)
305
+
306
+ # For each question type, show the most recent example
307
+ for question_type, type_logs in logs_by_type.items():
308
+ # Sort by timestamp (newest first)
309
+ sorted_logs = sorted(type_logs, key=lambda x: x.get("timestamp", ""), reverse=True)
310
+ example_log = sorted_logs[0]
311
+
312
+ error_message = example_log.get("error_message", "")
313
+ invalid_data = example_log.get("invalid_data", {})
314
+ model_schema = example_log.get("model_schema", {})
315
+
316
+ content.append(f"<div class='card'>")
317
+ content.append(f"<div class='card-header'>{question_type}</div>")
318
+
319
+ content.append(
320
+ f"<div class='example'>"
321
+ f"<p><strong>Error:</strong> {error_message}</p>"
322
+ f"<p><strong>Invalid Data:</strong></p>"
323
+ f"<pre><code>{json.dumps(invalid_data, indent=2)}</code></pre>"
324
+ f"<p><strong>Expected Schema:</strong></p>"
325
+ f"<pre><code>{json.dumps(model_schema, indent=2)}</code></pre>"
326
+ f"</div>"
327
+ )
328
+
329
+ content.append("</div>")
330
+
331
+ return "\n".join(content)
332
+
333
+
334
def generate_html_report(output_path: Optional[Path] = None) -> Path:
    """
    Generate an HTML report of validation failures.

    Pulls the most recent validation failure logs, aggregate statistics,
    and fix-method suggestions, renders them into HTML_TEMPLATE, and
    writes the result to disk.

    Args:
        output_path: Optional custom path for the report. When omitted,
            the report is written to "validation_report.html" inside the
            configured EDSL_LOG_DIR (falling back to ~/.edsl/logs).

    Returns:
        Path to the generated HTML report.
    """
    # Determine output path
    if output_path is None:
        default_log_dir = Path.home() / ".edsl" / "logs"
        try:
            report_dir = Path(CONFIG.get("EDSL_LOG_DIR"))
        except Exception:
            # If EDSL_LOG_DIR is not defined, use default
            report_dir = default_log_dir
        os.makedirs(report_dir, exist_ok=True)
        output_path = report_dir / "validation_report.html"

    # Get validation data
    logs = get_validation_failure_logs(n=100)  # Get up to 100 recent logs
    stats = get_validation_failure_stats()
    suggestions = suggest_fix_improvements()

    # Calculate summary statistics
    total_failures = sum(stats.get("by_question_type", {}).values())
    question_types_count = len(stats.get("by_question_type", {}))

    # Fill the template: one placeholder -> generated content each.
    replacements = {
        "{{timestamp}}": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "{{total_failures}}": str(total_failures),
        "{{question_types_count}}": str(question_types_count),
        "{{type_stats_rows}}": _generate_type_stats_rows(stats),
        "{{error_stats_rows}}": _generate_error_stats_rows(stats),
        "{{suggestions_content}}": _generate_suggestions_content(suggestions),
        "{{examples_content}}": _generate_examples_content(logs),
    }
    html_content = HTML_TEMPLATE
    for placeholder, value in replacements.items():
        html_content = html_content.replace(placeholder, value)

    # Write the report. The template declares <meta charset="UTF-8">, so
    # encode explicitly instead of relying on the platform default
    # (which is not UTF-8 on e.g. Windows).
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)

    return output_path
387
+
388
+
389
def generate_and_open_report() -> None:
    """Create the validation-failure report and try to show it in a browser.

    Falls back to printing the file location when a browser cannot be
    launched (e.g. in headless environments).
    """
    path_to_report = generate_html_report()
    print(f"Report generated at: {path_to_report}")

    # Opening the browser is best-effort only.
    try:
        import webbrowser

        webbrowser.open(f"file://{path_to_report}")
    except Exception as exc:
        print(f"Could not open browser: {exc}")
        print(f"Report is available at: {path_to_report}")
401
+
402
+
403
+ if __name__ == "__main__":
404
+ generate_and_open_report()
@@ -0,0 +1,136 @@
1
+ """Logger for validation failures in questions.
2
+
3
+ This module provides functionality to log validation failures that occur when
4
+ question answers don't meet the expected format or constraints. The logs can be
5
+ used to improve the "fix" methods for questions.
6
+ """
7
+
8
+ import datetime
9
+ import json
10
+ import logging
11
+ import os
12
+ import traceback
13
+ from pathlib import Path
14
+ from typing import Any, Dict, Optional
15
+
16
+ from ..config import CONFIG
17
+
18
# ---------------------------------------------------------------------------
# Module-level logging setup. NOTE(review): this runs at import time and has
# filesystem side effects (creates the log directory and attaches a file
# handler) — confirm this is acceptable for all import contexts.
# ---------------------------------------------------------------------------
logger = logging.getLogger("validation_failures")
logger.setLevel(logging.INFO)

# Determine log directory path: prefer the configured EDSL_LOG_DIR, falling
# back to ~/.edsl/logs when the config key is missing or CONFIG raises.
DEFAULT_LOG_DIR = Path.home() / ".edsl" / "logs"
try:
    LOG_DIR = Path(CONFIG.get("EDSL_LOG_DIR"))
except Exception:
    # If EDSL_LOG_DIR is not defined, use default
    LOG_DIR = DEFAULT_LOG_DIR
# Single shared log file that log_validation_failure() writes to and
# get_validation_failure_logs() reads back.
VALIDATION_LOG_FILE = LOG_DIR / "validation_failures.log"

# Create log directory if it doesn't exist
os.makedirs(LOG_DIR, exist_ok=True)

# File handler writes every INFO record to the shared log file.
file_handler = logging.FileHandler(VALIDATION_LOG_FILE)
file_handler.setLevel(logging.INFO)

# The "timestamp - name - level - message" format is what
# get_validation_failure_logs() expects when parsing lines back.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# Add handler to logger
logger.addHandler(file_handler)

# Touch the log file to make sure it exists (FileHandler normally creates
# it already; this is a belt-and-braces guard).
if not os.path.exists(VALIDATION_LOG_FILE):
    with open(VALIDATION_LOG_FILE, 'a'):
        pass
+
50
+
51
+ def log_validation_failure(
52
+ question_type: str,
53
+ question_name: str,
54
+ error_message: str,
55
+ invalid_data: Dict[str, Any],
56
+ model_schema: Dict[str, Any],
57
+ question_dict: Optional[Dict[str, Any]] = None,
58
+ ) -> None:
59
+ """
60
+ Log a validation failure to the validation failures log file.
61
+
62
+ Args:
63
+ question_type: The type of question that had a validation failure
64
+ question_name: The name of the question
65
+ error_message: The validation error message
66
+ invalid_data: The data that failed validation
67
+ model_schema: The schema of the model used for validation
68
+ question_dict: Optional dictionary representation of the question
69
+ """
70
+ log_entry = {
71
+ "timestamp": datetime.datetime.now().isoformat(),
72
+ "question_type": question_type,
73
+ "question_name": question_name,
74
+ "error_message": error_message,
75
+ "invalid_data": invalid_data,
76
+ "model_schema": model_schema,
77
+ "question_dict": question_dict,
78
+ "traceback": traceback.format_exc(),
79
+ }
80
+
81
+ # Log as JSON for easier parsing
82
+ logger.info(json.dumps(log_entry))
83
+
84
+ # Write directly to the file as well to ensure it's written
85
+ with open(VALIDATION_LOG_FILE, "a") as f:
86
+ f.write(f"{datetime.datetime.now().isoformat()} - validation_failures - INFO - {json.dumps(log_entry)}\n")
87
+ f.flush()
88
+
89
+
90
+ def get_validation_failure_logs(n: int = 10) -> list:
91
+ """
92
+ Get the latest n validation failure logs.
93
+
94
+ Args:
95
+ n: Number of logs to return (default: 10)
96
+
97
+ Returns:
98
+ List of validation failure log entries as dictionaries
99
+ """
100
+ logs = []
101
+
102
+ # Check if log file exists
103
+ if not os.path.exists(VALIDATION_LOG_FILE):
104
+ return logs
105
+
106
+ with open(VALIDATION_LOG_FILE, "r") as f:
107
+ for line in f:
108
+ try:
109
+ # Skip non-JSON lines (like logger initialization)
110
+ if not line.strip():
111
+ continue
112
+
113
+ # Handle both the Python logging format and our direct write format
114
+ parts = line.strip().split(" - ")
115
+ if len(parts) >= 4:
116
+ # Regular log line format: timestamp - name - level - message
117
+ json_part = parts[-1]
118
+ try:
119
+ log_entry = json.loads(json_part)
120
+ logs.append(log_entry)
121
+ except json.JSONDecodeError:
122
+ # Skip malformed JSON
123
+ continue
124
+ except (IndexError, ValueError):
125
+ # Skip malformed lines
126
+ continue
127
+
128
+ # Return most recent logs first
129
+ return sorted(logs, key=lambda x: x.get("timestamp", ""), reverse=True)[:n]
130
+
131
+
132
+ def clear_validation_logs() -> None:
133
+ """Clear all validation failure logs."""
134
+ if os.path.exists(VALIDATION_LOG_FILE):
135
+ with open(VALIDATION_LOG_FILE, "w") as f:
136
+ f.write("")
edsl/results/result.py CHANGED
@@ -450,6 +450,13 @@ class Result(Base, UserDict):
450
450
  else:
451
451
  d.pop("cache_used_dict", None)
452
452
 
453
+ if hasattr(self, "interview_hash"):
454
+ d["interview_hash"] = self.interview_hash
455
+
456
+ # Preserve the order attribute if it exists
457
+ if hasattr(self, "order"):
458
+ d["order"] = self.order
459
+
453
460
  return d
454
461
 
455
462
  def __hash__(self):
@@ -490,6 +497,13 @@ class Result(Base, UserDict):
490
497
  cache_keys=json_dict.get("cache_keys", {}),
491
498
  indices = json_dict.get("indices", None)
492
499
  )
500
+ if "interview_hash" in json_dict:
501
+ result.interview_hash = json_dict["interview_hash"]
502
+
503
+ # Restore the order attribute if it exists in the dictionary
504
+ if "order" in json_dict:
505
+ result.order = json_dict["order"]
506
+
493
507
  return result
494
508
 
495
509
  def __repr__(self):
@@ -574,10 +588,12 @@ class Result(Base, UserDict):
574
588
  return scoring_function(**params)
575
589
 
576
590
  @classmethod
577
- def from_interview(
578
- cls, interview, extracted_answers, model_response_objects
579
- ) -> Result:
580
- """Return a Result object from an interview dictionary."""
591
+ def from_interview(cls, interview) -> Result:
592
+ """Return a Result object from an interview dictionary, ensuring no reference to the original interview is maintained."""
593
+ # Copy the valid results to avoid maintaining references
594
+ model_response_objects = list(interview.valid_results) if hasattr(interview, 'valid_results') else []
595
+ # Create a copy of the answers
596
+ extracted_answers = dict(interview.answers) if hasattr(interview, 'answers') else {}
581
597
 
582
598
  def get_question_results(
583
599
  model_response_objects,
@@ -653,38 +669,69 @@ class Result(Base, UserDict):
653
669
 
654
670
  return raw_model_results_dictionary, cache_used_dictionary
655
671
 
672
+ # Save essential information from the interview before clearing references
673
+ agent_copy = interview.agent.copy() if hasattr(interview, 'agent') else None
674
+ scenario_copy = interview.scenario.copy() if hasattr(interview, 'scenario') else None
675
+ model_copy = interview.model.copy() if hasattr(interview, 'model') else None
676
+ iteration = interview.iteration if hasattr(interview, 'iteration') else 0
677
+ survey_copy = interview.survey.copy() if hasattr(interview, 'survey') and interview.survey else None
678
+ indices_copy = dict(interview.indices) if hasattr(interview, 'indices') and interview.indices else None
679
+ initial_hash = interview.initial_hash if hasattr(interview, 'initial_hash') else hash(interview)
680
+
681
+ # Process data to create dictionaries needed for Result
656
682
  question_results = get_question_results(model_response_objects)
657
683
  answer_key_names = list(question_results.keys())
658
- generated_tokens_dict = get_generated_tokens_dict(answer_key_names)
659
- comments_dict = get_comments_dict(answer_key_names)
660
- answer_dict = {k: extracted_answers[k] for k in answer_key_names}
684
+ generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
685
+ comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
686
+
687
+ # Get answers that are in the question results
688
+ answer_dict = {}
689
+ for k in answer_key_names:
690
+ if k in extracted_answers:
691
+ answer_dict[k] = extracted_answers[k]
692
+
661
693
  cache_keys = get_cache_keys(model_response_objects)
662
694
 
663
695
  question_name_to_prompts = get_question_name_to_prompts(model_response_objects)
664
696
  prompt_dictionary = get_prompt_dictionary(
665
697
  answer_key_names, question_name_to_prompts
666
- )
698
+ ) if answer_key_names else {}
699
+
667
700
  raw_model_results_dictionary, cache_used_dictionary = (
668
701
  get_raw_model_results_and_cache_used_dictionary(model_response_objects)
669
702
  )
670
703
 
704
+ # Create the Result object with all copied data
671
705
  result = cls(
672
- agent=interview.agent,
673
- scenario=interview.scenario,
674
- model=interview.model,
675
- iteration=interview.iteration,
676
- # Computed objects
706
+ agent=agent_copy,
707
+ scenario=scenario_copy,
708
+ model=model_copy,
709
+ iteration=iteration,
677
710
  answer=answer_dict,
678
711
  prompt=prompt_dictionary,
679
712
  raw_model_response=raw_model_results_dictionary,
680
- survey=interview.survey,
713
+ survey=survey_copy,
681
714
  generated_tokens=generated_tokens_dict,
682
715
  comments_dict=comments_dict,
683
716
  cache_used_dict=cache_used_dictionary,
684
- indices=interview.indices,
717
+ indices=indices_copy,
685
718
  cache_keys=cache_keys,
686
719
  )
687
- result.interview_hash = interview.initial_hash
720
+
721
+ # Store only the hash, not the interview
722
+ result.interview_hash = initial_hash
723
+
724
+ # Clear references to help garbage collection of the interview
725
+ if hasattr(interview, 'clear_references'):
726
+ interview.clear_references()
727
+
728
+ # Clear local references to help with garbage collection
729
+ del model_response_objects
730
+ del extracted_answers
731
+ del question_results
732
+ del answer_key_names
733
+ del question_name_to_prompts
734
+
688
735
  return result
689
736
 
690
737