edsl 0.1.58__py3-none-any.whl → 0.1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +23 -4
- edsl/agents/agent_list.py +36 -6
- edsl/base/data_transfer_models.py +5 -0
- edsl/base/enums.py +7 -2
- edsl/coop/coop.py +103 -1
- edsl/dataset/dataset.py +74 -0
- edsl/dataset/dataset_operations_mixin.py +69 -64
- edsl/inference_services/services/__init__.py +3 -1
- edsl/inference_services/services/open_ai_service_v2.py +243 -0
- edsl/inference_services/services/test_service.py +1 -1
- edsl/interviews/exception_tracking.py +66 -20
- edsl/invigilators/invigilators.py +5 -1
- edsl/invigilators/prompt_constructor.py +299 -136
- edsl/jobs/data_structures.py +3 -0
- edsl/jobs/html_table_job_logger.py +18 -1
- edsl/jobs/jobs_pricing_estimation.py +6 -2
- edsl/jobs/jobs_remote_inference_logger.py +2 -0
- edsl/jobs/remote_inference.py +34 -7
- edsl/key_management/key_lookup_builder.py +25 -3
- edsl/language_models/language_model.py +41 -3
- edsl/language_models/raw_response_handler.py +126 -7
- edsl/prompts/prompt.py +1 -0
- edsl/questions/question_list.py +76 -20
- edsl/results/result.py +37 -0
- edsl/results/results.py +9 -1
- edsl/scenarios/file_store.py +8 -12
- edsl/scenarios/scenario.py +50 -2
- edsl/scenarios/scenario_list.py +34 -12
- edsl/surveys/survey.py +4 -0
- edsl/tasks/task_history.py +180 -6
- edsl/utilities/wikipedia.py +194 -0
- {edsl-0.1.58.dist-info → edsl-0.1.60.dist-info}/METADATA +5 -4
- {edsl-0.1.58.dist-info → edsl-0.1.60.dist-info}/RECORD +37 -35
- {edsl-0.1.58.dist-info → edsl-0.1.60.dist-info}/LICENSE +0 -0
- {edsl-0.1.58.dist-info → edsl-0.1.60.dist-info}/WHEEL +0 -0
- {edsl-0.1.58.dist-info → edsl-0.1.60.dist-info}/entry_points.txt +0 -0
edsl/results/result.py
CHANGED
@@ -95,6 +95,7 @@ class Result(Base, UserDict):
|
|
95
95
|
question_to_attributes: Optional[dict[QuestionName, Any]] = None,
|
96
96
|
generated_tokens: Optional[dict] = None,
|
97
97
|
comments_dict: Optional[dict] = None,
|
98
|
+
reasoning_summaries_dict: Optional[dict] = None,
|
98
99
|
cache_used_dict: Optional[dict[QuestionName, bool]] = None,
|
99
100
|
indices: Optional[dict] = None,
|
100
101
|
cache_keys: Optional[dict[QuestionName, str]] = None,
|
@@ -112,6 +113,7 @@ class Result(Base, UserDict):
|
|
112
113
|
:param question_to_attributes: A dictionary of question attributes.
|
113
114
|
:param generated_tokens: A dictionary of generated tokens.
|
114
115
|
:param comments_dict: A dictionary of comments.
|
116
|
+
:param reasoning_summaries_dict: A dictionary of reasoning summaries.
|
115
117
|
:param cache_used_dict: A dictionary of cache usage.
|
116
118
|
:param indices: A dictionary of indices.
|
117
119
|
|
@@ -130,6 +132,7 @@ class Result(Base, UserDict):
|
|
130
132
|
"question_to_attributes": self.question_to_attributes,
|
131
133
|
"generated_tokens": generated_tokens or {},
|
132
134
|
"comments_dict": comments_dict or {},
|
135
|
+
"reasoning_summaries_dict": reasoning_summaries_dict or {},
|
133
136
|
"cache_used_dict": cache_used_dict or {},
|
134
137
|
"cache_keys": cache_keys or {},
|
135
138
|
}
|
@@ -236,6 +239,7 @@ class Result(Base, UserDict):
|
|
236
239
|
"answer": self.data["answer"],
|
237
240
|
"prompt": self.data["prompt"],
|
238
241
|
"comment": self.data["comments_dict"],
|
242
|
+
"reasoning_summary": self.data["reasoning_summaries_dict"],
|
239
243
|
"generated_tokens": self.data["generated_tokens"],
|
240
244
|
"raw_model_response": self.data["raw_model_response"],
|
241
245
|
"question_text": sub_dicts_needing_new_keys["question_text"],
|
@@ -497,6 +501,7 @@ class Result(Base, UserDict):
|
|
497
501
|
question_to_attributes=json_dict.get("question_to_attributes", None),
|
498
502
|
generated_tokens=json_dict.get("generated_tokens", {}),
|
499
503
|
comments_dict=json_dict.get("comments_dict", {}),
|
504
|
+
reasoning_summaries_dict=json_dict.get("reasoning_summaries_dict", {}),
|
500
505
|
cache_used_dict=json_dict.get("cache_used_dict", {}),
|
501
506
|
cache_keys=json_dict.get("cache_keys", {}),
|
502
507
|
indices=json_dict.get("indices", None),
|
@@ -631,6 +636,36 @@ class Result(Base, UserDict):
|
|
631
636
|
}
|
632
637
|
return comments_dict
|
633
638
|
|
639
|
+
def get_reasoning_summaries_dict(answer_key_names) -> dict[str, Any]:
|
640
|
+
reasoning_summaries_dict = {}
|
641
|
+
for k in answer_key_names:
|
642
|
+
reasoning_summary = question_results[k].reasoning_summary
|
643
|
+
|
644
|
+
# If reasoning summary is None but we have a raw model response, try to extract it
|
645
|
+
if reasoning_summary is None and hasattr(question_results[k], 'raw_model_response'):
|
646
|
+
try:
|
647
|
+
# Get the model class to access the reasoning_sequence
|
648
|
+
model_class = interview.model.__class__ if hasattr(interview, 'model') else None
|
649
|
+
|
650
|
+
if model_class and hasattr(model_class, 'reasoning_sequence'):
|
651
|
+
from ..language_models.raw_response_handler import RawResponseHandler
|
652
|
+
|
653
|
+
# Create a handler with the model's reasoning sequence
|
654
|
+
handler = RawResponseHandler(
|
655
|
+
key_sequence=model_class.key_sequence if hasattr(model_class, 'key_sequence') else None,
|
656
|
+
usage_sequence=model_class.usage_sequence if hasattr(model_class, 'usage_sequence') else None,
|
657
|
+
reasoning_sequence=model_class.reasoning_sequence
|
658
|
+
)
|
659
|
+
|
660
|
+
# Try to extract the reasoning summary
|
661
|
+
reasoning_summary = handler.get_reasoning_summary(question_results[k].raw_model_response)
|
662
|
+
except Exception:
|
663
|
+
# If extraction fails, keep it as None
|
664
|
+
pass
|
665
|
+
|
666
|
+
reasoning_summaries_dict[k + "_reasoning_summary"] = reasoning_summary
|
667
|
+
return reasoning_summaries_dict
|
668
|
+
|
634
669
|
def get_question_name_to_prompts(
|
635
670
|
model_response_objects,
|
636
671
|
) -> dict[str, dict[str, str]]:
|
@@ -705,6 +740,7 @@ class Result(Base, UserDict):
|
|
705
740
|
answer_key_names = list(question_results.keys())
|
706
741
|
generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
|
707
742
|
comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
|
743
|
+
reasoning_summaries_dict = get_reasoning_summaries_dict(answer_key_names) if answer_key_names else {}
|
708
744
|
|
709
745
|
# Get answers that are in the question results
|
710
746
|
answer_dict = {}
|
@@ -735,6 +771,7 @@ class Result(Base, UserDict):
|
|
735
771
|
survey=survey_copy,
|
736
772
|
generated_tokens=generated_tokens_dict,
|
737
773
|
comments_dict=comments_dict,
|
774
|
+
reasoning_summaries_dict=reasoning_summaries_dict,
|
738
775
|
cache_used_dict=cache_used_dictionary,
|
739
776
|
indices=indices_copy,
|
740
777
|
cache_keys=cache_keys,
|
edsl/results/results.py
CHANGED
@@ -273,6 +273,7 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
273
273
|
"generated_tokens",
|
274
274
|
"cache_used",
|
275
275
|
"cache_keys",
|
276
|
+
"reasoning_summary",
|
276
277
|
]
|
277
278
|
|
278
279
|
@classmethod
|
@@ -771,6 +772,10 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
771
772
|
def to_dataset(self) -> "Dataset":
|
772
773
|
return self.select()
|
773
774
|
|
775
|
+
def optimzie_scenarios(self):
|
776
|
+
for result in self.data:
|
777
|
+
result.scenario.offload(inplace=True)
|
778
|
+
|
774
779
|
def to_dict(
|
775
780
|
self,
|
776
781
|
sort: bool = False,
|
@@ -778,9 +783,12 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
778
783
|
include_cache: bool = True,
|
779
784
|
include_task_history: bool = False,
|
780
785
|
include_cache_info: bool = True,
|
786
|
+
offload_scenarios: bool = True,
|
781
787
|
) -> dict[str, Any]:
|
782
788
|
from ..caching import Cache
|
783
789
|
|
790
|
+
if offload_scenarios:
|
791
|
+
self.optimzie_scenarios()
|
784
792
|
if sort:
|
785
793
|
data = sorted([result for result in self.data], key=lambda x: hash(x))
|
786
794
|
else:
|
@@ -809,7 +817,7 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
809
817
|
)
|
810
818
|
|
811
819
|
if self.task_history.has_unfixed_exceptions or include_task_history:
|
812
|
-
d.update({"task_history": self.task_history.to_dict()})
|
820
|
+
d.update({"task_history": self.task_history.to_dict(offload_content=True)})
|
813
821
|
|
814
822
|
if add_edsl_version:
|
815
823
|
from .. import __version__
|
edsl/scenarios/file_store.py
CHANGED
@@ -446,9 +446,7 @@ class FileStore(Scenario):
|
|
446
446
|
if suffix is None:
|
447
447
|
suffix = self.suffix
|
448
448
|
if self.binary:
|
449
|
-
file_like_object = self.base64_to_file(
|
450
|
-
self["base64_string"], is_binary=True
|
451
|
-
)
|
449
|
+
file_like_object = self.base64_to_file(self.base64_string, is_binary=True)
|
452
450
|
else:
|
453
451
|
file_like_object = self.base64_to_text_file(self.base64_string)
|
454
452
|
|
@@ -765,15 +763,13 @@ class FileStore(Scenario):
|
|
765
763
|
if name.startswith("__") and name.endswith("__"):
|
766
764
|
raise AttributeError(name)
|
767
765
|
|
768
|
-
#
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
return getattr(df, name)
|
776
|
-
# If not a CSV or attribute doesn't exist in DataFrame, raise AttributeError
|
766
|
+
# Check for _data directly in __dict__ to avoid recursion
|
767
|
+
_data = self.__dict__.get("_data", None)
|
768
|
+
if _data and _data.get("suffix") == "csv":
|
769
|
+
df = self.to_pandas()
|
770
|
+
if hasattr(df, name):
|
771
|
+
return getattr(df, name)
|
772
|
+
|
777
773
|
raise AttributeError(
|
778
774
|
f"'{self.__class__.__name__}' object has no attribute '{name}'"
|
779
775
|
)
|
edsl/scenarios/scenario.py
CHANGED
@@ -264,9 +264,49 @@ class Scenario(Base, UserDict):
|
|
264
264
|
"""Display a scenario as a table."""
|
265
265
|
return self.to_dataset().table(tablefmt=tablefmt)
|
266
266
|
|
267
|
-
def
|
267
|
+
def offload(self, inplace=False) -> "Scenario":
|
268
|
+
"""
|
269
|
+
Offloads base64-encoded content from the scenario by replacing 'base64_string'
|
270
|
+
fields with 'offloaded'. This reduces memory usage.
|
271
|
+
|
272
|
+
Args:
|
273
|
+
inplace (bool): If True, modify the current scenario. If False, return a new one.
|
274
|
+
|
275
|
+
Returns:
|
276
|
+
Scenario: The modified scenario (either self or a new instance).
|
277
|
+
"""
|
278
|
+
from edsl.scenarios import FileStore
|
279
|
+
from edsl.prompts import Prompt
|
280
|
+
|
281
|
+
target = self if inplace else Scenario()
|
282
|
+
|
283
|
+
for key, value in self.items():
|
284
|
+
if isinstance(value, FileStore):
|
285
|
+
file_store_dict = value.to_dict()
|
286
|
+
if "base64_string" in file_store_dict:
|
287
|
+
file_store_dict["base64_string"] = "offloaded"
|
288
|
+
modified_value = FileStore.from_dict(file_store_dict)
|
289
|
+
elif isinstance(value, dict) and "base64_string" in value:
|
290
|
+
value_copy = value.copy()
|
291
|
+
value_copy["base64_string"] = "offloaded"
|
292
|
+
modified_value = value_copy
|
293
|
+
else:
|
294
|
+
modified_value = value
|
295
|
+
|
296
|
+
target[key] = modified_value
|
297
|
+
|
298
|
+
return target
|
299
|
+
|
300
|
+
def to_dict(
|
301
|
+
self, add_edsl_version: bool = True, offload_base64: bool = False
|
302
|
+
) -> dict:
|
268
303
|
"""Convert a scenario to a dictionary.
|
269
304
|
|
305
|
+
Args:
|
306
|
+
add_edsl_version: If True, adds the EDSL version to the returned dictionary.
|
307
|
+
offload_base64: If True, replaces any base64_string fields with 'offloaded'
|
308
|
+
to reduce memory usage.
|
309
|
+
|
270
310
|
Example:
|
271
311
|
|
272
312
|
>>> s = Scenario({"food": "wood chips"})
|
@@ -283,7 +323,15 @@ class Scenario(Base, UserDict):
|
|
283
323
|
d = self.data.copy()
|
284
324
|
for key, value in d.items():
|
285
325
|
if isinstance(value, FileStore) or isinstance(value, Prompt):
|
286
|
-
|
326
|
+
value_dict = value.to_dict(add_edsl_version=add_edsl_version)
|
327
|
+
if (
|
328
|
+
offload_base64
|
329
|
+
and isinstance(value_dict, dict)
|
330
|
+
and "base64_string" in value_dict
|
331
|
+
):
|
332
|
+
value_dict["base64_string"] = "offloaded"
|
333
|
+
d[key] = value_dict
|
334
|
+
|
287
335
|
if add_edsl_version:
|
288
336
|
from edsl import __version__
|
289
337
|
|
edsl/scenarios/scenario_list.py
CHANGED
@@ -145,22 +145,18 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
|
|
145
145
|
"""Initialize a new ScenarioList with optional data and codebook."""
|
146
146
|
self._data_class = data_class
|
147
147
|
self.data = self._data_class([])
|
148
|
-
warned = False
|
149
148
|
for item in data or []:
|
150
|
-
try:
|
151
|
-
_ = json.dumps(item.to_dict())
|
152
|
-
except:
|
153
|
-
import warnings
|
154
|
-
if not warned:
|
155
|
-
warnings.warn(
|
156
|
-
f"One or more items in the data list are not JSON serializable. "
|
157
|
-
"This would prevent running a job that uses this ScenarioList."
|
158
|
-
"One solution is to use 'str(item)' to convert the item to a string before adding."
|
159
|
-
)
|
160
|
-
warned = True
|
161
149
|
self.data.append(item)
|
162
150
|
self.codebook = codebook or {}
|
163
151
|
|
152
|
+
def is_serializable(self):
|
153
|
+
for item in self.data:
|
154
|
+
try:
|
155
|
+
_ = json.dumps(item.to_dict())
|
156
|
+
except Exception as e:
|
157
|
+
return False
|
158
|
+
return True
|
159
|
+
|
164
160
|
# Required MutableSequence abstract methods
|
165
161
|
def __getitem__(self, index):
|
166
162
|
"""Get item at index."""
|
@@ -360,6 +356,32 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
|
|
360
356
|
new_scenarios.append(Scenario(new_scenario))
|
361
357
|
|
362
358
|
return new_scenarios
|
359
|
+
|
360
|
+
@classmethod
|
361
|
+
def from_search_terms(cls, search_terms: List[str]) -> ScenarioList:
|
362
|
+
"""Create a ScenarioList from a list of search terms, using Wikipedia.
|
363
|
+
|
364
|
+
Args:
|
365
|
+
search_terms: A list of search terms.
|
366
|
+
"""
|
367
|
+
from ..utilities.wikipedia import fetch_wikipedia_content
|
368
|
+
results = fetch_wikipedia_content(search_terms)
|
369
|
+
return cls([Scenario(result) for result in results])
|
370
|
+
|
371
|
+
def augment_with_wikipedia(self, search_key:str, content_only: bool = True, key_name: str = "wikipedia_content") -> ScenarioList:
|
372
|
+
"""Augment the ScenarioList with Wikipedia content."""
|
373
|
+
search_terms = self.select(search_key).to_list()
|
374
|
+
wikipedia_results = ScenarioList.from_search_terms(search_terms)
|
375
|
+
new_sl = ScenarioList(data = [], codebook = self.codebook)
|
376
|
+
for scenario, wikipedia_result in zip(self, wikipedia_results):
|
377
|
+
if content_only:
|
378
|
+
scenario[key_name] = wikipedia_result["content"]
|
379
|
+
new_sl.append(scenario)
|
380
|
+
else:
|
381
|
+
scenario[key_name] = wikipedia_result
|
382
|
+
new_sl.append(scenario)
|
383
|
+
return new_sl
|
384
|
+
|
363
385
|
|
364
386
|
def pivot(
|
365
387
|
self,
|
edsl/surveys/survey.py
CHANGED
@@ -384,6 +384,10 @@ class Survey(Base):
|
|
384
384
|
if question_name not in self.question_name_to_index:
|
385
385
|
raise SurveyError(f"Question name {question_name} not found in survey.")
|
386
386
|
return self.questions[self.question_name_to_index[question_name]]
|
387
|
+
|
388
|
+
def get(self, question_name: str) -> QuestionBase:
|
389
|
+
"""Return the question object given the question name."""
|
390
|
+
return self._get_question_by_name(question_name)
|
387
391
|
|
388
392
|
def question_names_to_questions(self) -> dict:
|
389
393
|
"""Return a dictionary mapping question names to question attributes."""
|
edsl/tasks/task_history.py
CHANGED
@@ -43,6 +43,7 @@ class TaskHistory(RepresentationMixin):
|
|
43
43
|
- Generates interactive HTML reports with filtering and drill-down
|
44
44
|
- Computes statistics across interviews (by model, question type, etc.)
|
45
45
|
- Exports to various formats (HTML, notebook, etc.)
|
46
|
+
- Memory optimization via offloading of large file content
|
46
47
|
"""
|
47
48
|
|
48
49
|
def __init__(
|
@@ -191,8 +192,22 @@ class TaskHistory(RepresentationMixin):
|
|
191
192
|
"""Return a string representation of the TaskHistory."""
|
192
193
|
return f"TaskHistory(interviews={self.total_interviews})."
|
193
194
|
|
194
|
-
def to_dict(self, add_edsl_version=True):
|
195
|
-
"""
|
195
|
+
def to_dict(self, add_edsl_version=True, offload_content=False):
|
196
|
+
"""
|
197
|
+
Return the TaskHistory as a dictionary.
|
198
|
+
|
199
|
+
Parameters:
|
200
|
+
add_edsl_version: Whether to include EDSL version in the output
|
201
|
+
offload_content: Whether to offload large file content like videos and images
|
202
|
+
to reduce memory usage
|
203
|
+
|
204
|
+
Returns:
|
205
|
+
A dictionary representation of this TaskHistory instance
|
206
|
+
"""
|
207
|
+
# Offload large file content if requested
|
208
|
+
if offload_content:
|
209
|
+
self.offload_files_content()
|
210
|
+
|
196
211
|
# Serialize each interview object
|
197
212
|
interview_dicts = []
|
198
213
|
for i in self.total_interviews:
|
@@ -255,27 +270,60 @@ class TaskHistory(RepresentationMixin):
|
|
255
270
|
InterviewExceptionCollection,
|
256
271
|
)
|
257
272
|
|
273
|
+
# Store the original data in full
|
274
|
+
self._original_data = data
|
275
|
+
|
276
|
+
# Preserve the original interview id
|
277
|
+
self._interview_id = data.get("id", None)
|
278
|
+
|
279
|
+
# Store exceptions using the original data structure
|
280
|
+
# This ensures when we re-serialize, we keep original data intact
|
281
|
+
self._exceptions_data = data.get("exceptions", {})
|
282
|
+
|
283
|
+
# Create the InterviewExceptionCollection for runtime use
|
258
284
|
exceptions_data = data.get("exceptions", {})
|
259
285
|
self.exceptions = (
|
260
286
|
InterviewExceptionCollection.from_dict(exceptions_data)
|
261
287
|
if exceptions_data
|
262
288
|
else InterviewExceptionCollection()
|
263
289
|
)
|
290
|
+
|
291
|
+
# Store other fields
|
264
292
|
self.task_status_logs = data.get("task_status_logs", {})
|
265
293
|
self.model = data.get("model", {})
|
266
294
|
self.survey = data.get("survey", {})
|
267
295
|
|
268
296
|
def to_dict(self, add_edsl_version=True):
|
269
|
-
|
297
|
+
# Use the original exceptions data structure when serializing again
|
298
|
+
# This preserves all exception details exactly as they were
|
299
|
+
data = {
|
270
300
|
"type": "InterviewReference",
|
271
|
-
"exceptions": self.
|
272
|
-
if hasattr(self
|
273
|
-
else
|
301
|
+
"exceptions": self._exceptions_data
|
302
|
+
if hasattr(self, "_exceptions_data")
|
303
|
+
else (
|
304
|
+
self.exceptions.to_dict()
|
305
|
+
if hasattr(self.exceptions, "to_dict")
|
306
|
+
else self.exceptions
|
307
|
+
),
|
274
308
|
"task_status_logs": self.task_status_logs,
|
275
309
|
"model": self.model,
|
276
310
|
"survey": self.survey,
|
277
311
|
}
|
278
312
|
|
313
|
+
# Preserve the original interview id if it exists
|
314
|
+
if self._interview_id:
|
315
|
+
data["id"] = self._interview_id
|
316
|
+
|
317
|
+
# Preserve original version info
|
318
|
+
if (
|
319
|
+
add_edsl_version
|
320
|
+
and hasattr(self, "_original_data")
|
321
|
+
and "edsl_version" in self._original_data
|
322
|
+
):
|
323
|
+
data["edsl_version"] = self._original_data["edsl_version"]
|
324
|
+
|
325
|
+
return data
|
326
|
+
|
279
327
|
# Create the reference and add it directly
|
280
328
|
ref = DeserializedInterviewRef(interview_data)
|
281
329
|
instance.total_interviews.append(ref)
|
@@ -728,6 +776,132 @@ class TaskHistory(RepresentationMixin):
|
|
728
776
|
|
729
777
|
return nb
|
730
778
|
|
779
|
+
def offload_files_content(self):
|
780
|
+
"""
|
781
|
+
Offload large file content from scenarios in interview exceptions.
|
782
|
+
|
783
|
+
This method iterates over all the interview exceptions and calls the offload method
|
784
|
+
for any scenario components in the invigilator. This significantly reduces memory usage
|
785
|
+
by replacing base64-encoded content with a placeholder string, while preserving the
|
786
|
+
structure of the scenarios.
|
787
|
+
|
788
|
+
Returns:
|
789
|
+
self: Returns the TaskHistory instance for method chaining
|
790
|
+
|
791
|
+
This is particularly useful for TaskHistory instances containing interviews with
|
792
|
+
large file content, such as videos, images, or other binary data.
|
793
|
+
"""
|
794
|
+
for interview in self.total_interviews:
|
795
|
+
if not hasattr(interview, "exceptions") or not interview.exceptions:
|
796
|
+
continue
|
797
|
+
|
798
|
+
for question_name, exceptions in interview.exceptions.items():
|
799
|
+
for exception in exceptions:
|
800
|
+
# Check if exception has an invigilator with scenario
|
801
|
+
if hasattr(exception, "invigilator") and exception.invigilator:
|
802
|
+
if (
|
803
|
+
hasattr(exception.invigilator, "scenario")
|
804
|
+
and exception.invigilator.scenario
|
805
|
+
):
|
806
|
+
# Call the offload method on the scenario
|
807
|
+
if hasattr(exception.invigilator.scenario, "offload"):
|
808
|
+
try:
|
809
|
+
# Replace the original scenario with the offloaded version
|
810
|
+
exception.invigilator.scenario = (
|
811
|
+
exception.invigilator.scenario.offload()
|
812
|
+
)
|
813
|
+
except Exception as e:
|
814
|
+
# Silently continue if offloading fails for any reason
|
815
|
+
pass
|
816
|
+
|
817
|
+
return self
|
818
|
+
|
819
|
+
def deduplicate_and_clean_interviews(self):
|
820
|
+
"""
|
821
|
+
Deduplicates exception entries in this task history to reduce memory usage.
|
822
|
+
|
823
|
+
This method removes duplicate error messages across interviews while preserving
|
824
|
+
the first occurrence of each unique error. This significantly reduces the size
|
825
|
+
of serialized task history data, especially for jobs with many similar errors.
|
826
|
+
|
827
|
+
Returns:
|
828
|
+
self: Returns the TaskHistory instance for method chaining.
|
829
|
+
"""
|
830
|
+
seen = set()
|
831
|
+
cleaned_interviews = []
|
832
|
+
|
833
|
+
for interview in self.total_interviews:
|
834
|
+
# Skip if interview has no exceptions
|
835
|
+
if not hasattr(interview, "exceptions") or not interview.exceptions:
|
836
|
+
continue
|
837
|
+
|
838
|
+
keep_interview = False
|
839
|
+
questions_to_modify = {}
|
840
|
+
questions_to_remove = []
|
841
|
+
|
842
|
+
# First pass: Collect all modifications without changing the dictionary
|
843
|
+
if hasattr(interview.exceptions, "items"):
|
844
|
+
for question_name, exceptions in list(interview.exceptions.items()):
|
845
|
+
filtered_exceptions = []
|
846
|
+
|
847
|
+
for exception in exceptions:
|
848
|
+
# Get the exception message (may require different access based on structure)
|
849
|
+
if hasattr(exception, "exception") and hasattr(
|
850
|
+
exception.exception, "args"
|
851
|
+
):
|
852
|
+
message = (
|
853
|
+
str(exception.exception.args[0])
|
854
|
+
if exception.exception.args
|
855
|
+
else ""
|
856
|
+
)
|
857
|
+
else:
|
858
|
+
message = str(exception)
|
859
|
+
|
860
|
+
# Create a unique key for this exception
|
861
|
+
key = (question_name, message)
|
862
|
+
|
863
|
+
# Only keep exceptions we haven't seen before
|
864
|
+
if key not in seen:
|
865
|
+
seen.add(key)
|
866
|
+
filtered_exceptions.append(exception)
|
867
|
+
|
868
|
+
# Track what should happen to this question's exceptions
|
869
|
+
if filtered_exceptions:
|
870
|
+
keep_interview = True
|
871
|
+
questions_to_modify[question_name] = filtered_exceptions
|
872
|
+
else:
|
873
|
+
questions_to_remove.append(question_name)
|
874
|
+
|
875
|
+
# Second pass: Apply all modifications safely
|
876
|
+
if hasattr(interview.exceptions, "items"):
|
877
|
+
# Add/replace filtered exceptions
|
878
|
+
for question_name, filtered_exceptions in questions_to_modify.items():
|
879
|
+
interview.exceptions[question_name] = filtered_exceptions
|
880
|
+
|
881
|
+
# Remove questions with all duplicate exceptions
|
882
|
+
for question_name in questions_to_remove:
|
883
|
+
if hasattr(interview.exceptions, "pop"):
|
884
|
+
interview.exceptions.pop(question_name, None)
|
885
|
+
elif (
|
886
|
+
hasattr(interview.exceptions, "__delitem__")
|
887
|
+
and question_name in interview.exceptions
|
888
|
+
):
|
889
|
+
del interview.exceptions[question_name]
|
890
|
+
|
891
|
+
# Only keep the interview if it still has exceptions after filtering
|
892
|
+
if keep_interview:
|
893
|
+
cleaned_interviews.append(interview)
|
894
|
+
|
895
|
+
# Replace the total_interviews with our cleaned list
|
896
|
+
self.total_interviews = cleaned_interviews
|
897
|
+
|
898
|
+
# Rebuild the _interviews dictionary
|
899
|
+
self._interviews = {
|
900
|
+
index: interview for index, interview in enumerate(self.total_interviews)
|
901
|
+
}
|
902
|
+
|
903
|
+
return self
|
904
|
+
|
731
905
|
|
732
906
|
if __name__ == "__main__":
|
733
907
|
import doctest
|