edsl 0.1.57__py3-none-any.whl → 0.1.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +23 -4
- edsl/agents/agent_list.py +36 -6
- edsl/coop/coop.py +274 -35
- edsl/coop/utils.py +63 -0
- edsl/dataset/dataset.py +74 -0
- edsl/dataset/dataset_operations_mixin.py +67 -62
- edsl/inference_services/services/test_service.py +1 -1
- edsl/interviews/exception_tracking.py +92 -20
- edsl/invigilators/invigilators.py +5 -1
- edsl/invigilators/prompt_constructor.py +299 -136
- edsl/jobs/html_table_job_logger.py +394 -48
- edsl/jobs/jobs_pricing_estimation.py +19 -114
- edsl/jobs/jobs_remote_inference_logger.py +29 -0
- edsl/jobs/jobs_runner_status.py +52 -21
- edsl/jobs/remote_inference.py +214 -30
- edsl/language_models/language_model.py +40 -3
- edsl/language_models/price_manager.py +91 -57
- edsl/prompts/prompt.py +1 -0
- edsl/questions/question_list.py +76 -20
- edsl/results/results.py +8 -1
- edsl/scenarios/file_store.py +8 -12
- edsl/scenarios/scenario.py +50 -2
- edsl/scenarios/scenario_list.py +34 -12
- edsl/surveys/survey.py +4 -0
- edsl/tasks/task_history.py +180 -6
- edsl/utilities/wikipedia.py +194 -0
- {edsl-0.1.57.dist-info → edsl-0.1.59.dist-info}/METADATA +4 -3
- {edsl-0.1.57.dist-info → edsl-0.1.59.dist-info}/RECORD +32 -32
- edsl/language_models/compute_cost.py +0 -78
- {edsl-0.1.57.dist-info → edsl-0.1.59.dist-info}/LICENSE +0 -0
- {edsl-0.1.57.dist-info → edsl-0.1.59.dist-info}/WHEEL +0 -0
- {edsl-0.1.57.dist-info → edsl-0.1.59.dist-info}/entry_points.txt +0 -0
edsl/questions/question_list.py
CHANGED
@@ -299,23 +299,24 @@ class ListResponseValidator(ResponseValidatorABC):
|
|
299
299
|
# This method can now be removed since validation is handled in the Pydantic model
|
300
300
|
pass
|
301
301
|
|
302
|
-
def fix(self, response, verbose=False):
|
302
|
+
def fix(self, response, verbose=False) -> dict[str, Any]:
    """
    Fix common issues in list responses by splitting strings into lists.

    The text to repair is taken from ``response["answer"]``; when that is
    missing or falsy, ``response["generated_tokens"]`` is used instead. The
    text is then converted as follows:

    - contains a comma -> split on ``","`` (surrounding whitespace is kept)
    - empty string     -> empty list
    - anything else    -> single-item list

    Args:
        response: The raw response dictionary to repair.
        verbose: If True, print the response being fixed.

    Returns:
        A new dictionary with an ``"answer"`` list and, when present in
        ``response``, the original ``"comment"``.

    Examples:
        >>> from edsl import QuestionList
        >>> q_constrained = QuestionList.example(min_list_items=2, max_list_items=4)
        >>> validator_constrained = q_constrained.response_validator

        >>> q_permissive = QuestionList.example(permissive=True)
        >>> validator_permissive = q_permissive.response_validator

        >>> # Fix a string that should be a list
        >>> bad_response = {"answer": "apple,banana,cherry"}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': ['apple', 'banana', 'cherry']}
        >>> validator_constrained.validate(fixed)  # Show full response after validation
        {'answer': ['apple', 'banana', 'cherry'], 'comment': None, 'generated_tokens': None}

        >>> # Fix using generated_tokens when answer is invalid
        >>> bad_response = {
        ...     "answer": None,
        ...     "generated_tokens": "pizza, pasta, salad"
        ... }
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': ['pizza', ' pasta', ' salad']}
        >>> validator_constrained.validate(fixed)
        {'answer': ['pizza', ' pasta', ' salad'], 'comment': None, 'generated_tokens': None}

        >>> # Preserve comments during fixing
        >>> bad_response = {
        ...     "answer": "red,blue,green",
        ...     "comment": "These are colors"
        ... }
        >>> fixed_output = validator_constrained.fix(bad_response)
        >>> fixed_output
        {'answer': ['red', 'blue', 'green'], 'comment': 'These are colors'}
        >>> validated_output = validator_constrained.validate(fixed_output)
        >>> validated_output == {
        ...     "answer": ["red", "blue", "green"],
        ...     "comment": "These are colors",
        ...     "generated_tokens": None
        ... }
        True

        >>> # Fix an empty string answer
        >>> bad_response = {"answer": ""}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': []}
        >>> validator_permissive.validate(fixed)
        {'answer': [], 'comment': None, 'generated_tokens': None}

        >>> # Fix a single item string answer (no commas)
        >>> bad_response = {"answer": "single_item"}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': ['single_item']}
        >>> validator_permissive.validate(fixed)
        {'answer': ['single_item'], 'comment': None, 'generated_tokens': None}

        >>> # Fix when answer is None and no generated_tokens
        >>> bad_response = {"answer": None}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': []}
        >>> validator_permissive.validate(fixed)
        {'answer': [], 'comment': None, 'generated_tokens': None}

        >>> # An explicit None for generated_tokens is treated as "no text"
        >>> validator_constrained.fix({"answer": None, "generated_tokens": None})
        {'answer': []}

        >>> # Fix when answer key is missing but generated_tokens is present
        >>> bad_response = {"generated_tokens": "token1,token2"}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': ['token1', 'token2']}
        >>> validator_constrained.validate(fixed)  # 2 items, OK for constrained validator
        {'answer': ['token1', 'token2'], 'comment': None, 'generated_tokens': None}

        >>> # Fix when answer key is missing and generated_tokens is an empty string
        >>> bad_response = {"generated_tokens": ""}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': []}
        >>> validator_permissive.validate(fixed)
        {'answer': [], 'comment': None, 'generated_tokens': None}

        >>> # Fix when answer key is missing and generated_tokens is a single item
        >>> bad_response = {"generated_tokens": "single_token"}
        >>> fixed = validator_constrained.fix(bad_response)
        >>> fixed
        {'answer': ['single_token']}
        >>> validator_permissive.validate(fixed)
        {'answer': ['single_token'], 'comment': None, 'generated_tokens': None}
    """
    if verbose:
        print(f"Fixing list response: {response}")

    raw = response.get("answer") or response.get("generated_tokens")
    # An explicit None must not be stringified: str(None) == "None" would
    # otherwise come back as the bogus single-item answer ['None'].
    answer = "" if raw is None else str(raw)

    if "," in answer:
        result = {"answer": answer.split(",")}
    elif answer == "":
        result = {"answer": []}
    else:
        result = {"answer": [answer]}

    if "comment" in response:
        result["comment"] = response["comment"]
    return result
|
@@ -395,7 +451,7 @@ class QuestionList(QuestionBase):
|
|
395
451
|
|
396
452
|
self.include_comment = include_comment
|
397
453
|
self.answering_instructions = answering_instructions
|
398
|
-
self.
|
454
|
+
self.question_presentation = question_presentation
|
399
455
|
|
400
456
|
def create_response_model(self):
|
401
457
|
return create_model(self.min_list_items, self.max_list_items, self.permissive)
|
edsl/results/results.py
CHANGED
@@ -771,6 +771,10 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
771
771
|
def to_dataset(self) -> "Dataset":
    """Return the result of calling ``select()`` with no arguments."""
    dataset = self.select()
    return dataset
|
773
773
|
|
774
|
+
def optimize_scenarios(self):
    """Call ``scenario.offload(inplace=True)`` on every result in ``self.data``.

    Each result's scenario is modified in place; nothing is returned.
    """
    for result in self.data:
        result.scenario.offload(inplace=True)


# Keep the original, misspelled name working for existing callers.
optimzie_scenarios = optimize_scenarios
|
777
|
+
|
774
778
|
def to_dict(
|
775
779
|
self,
|
776
780
|
sort: bool = False,
|
@@ -778,9 +782,12 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
778
782
|
include_cache: bool = True,
|
779
783
|
include_task_history: bool = False,
|
780
784
|
include_cache_info: bool = True,
|
785
|
+
offload_scenarios: bool = True,
|
781
786
|
) -> dict[str, Any]:
|
782
787
|
from ..caching import Cache
|
783
788
|
|
789
|
+
if offload_scenarios:
|
790
|
+
self.optimzie_scenarios()
|
784
791
|
if sort:
|
785
792
|
data = sorted([result for result in self.data], key=lambda x: hash(x))
|
786
793
|
else:
|
@@ -809,7 +816,7 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
|
|
809
816
|
)
|
810
817
|
|
811
818
|
if self.task_history.has_unfixed_exceptions or include_task_history:
|
812
|
-
d.update({"task_history": self.task_history.to_dict()})
|
819
|
+
d.update({"task_history": self.task_history.to_dict(offload_content=True)})
|
813
820
|
|
814
821
|
if add_edsl_version:
|
815
822
|
from .. import __version__
|
edsl/scenarios/file_store.py
CHANGED
@@ -446,9 +446,7 @@ class FileStore(Scenario):
|
|
446
446
|
if suffix is None:
|
447
447
|
suffix = self.suffix
|
448
448
|
if self.binary:
|
449
|
-
file_like_object = self.base64_to_file(
|
450
|
-
self["base64_string"], is_binary=True
|
451
|
-
)
|
449
|
+
file_like_object = self.base64_to_file(self.base64_string, is_binary=True)
|
452
450
|
else:
|
453
451
|
file_like_object = self.base64_to_text_file(self.base64_string)
|
454
452
|
|
@@ -765,15 +763,13 @@ class FileStore(Scenario):
|
|
765
763
|
if name.startswith("__") and name.endswith("__"):
|
766
764
|
raise AttributeError(name)
|
767
765
|
|
768
|
-
#
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
return getattr(df, name)
|
776
|
-
# If not a CSV or attribute doesn't exist in DataFrame, raise AttributeError
|
766
|
+
# Check for _data directly in __dict__ to avoid recursion
|
767
|
+
_data = self.__dict__.get("_data", None)
|
768
|
+
if _data and _data.get("suffix") == "csv":
|
769
|
+
df = self.to_pandas()
|
770
|
+
if hasattr(df, name):
|
771
|
+
return getattr(df, name)
|
772
|
+
|
777
773
|
raise AttributeError(
|
778
774
|
f"'{self.__class__.__name__}' object has no attribute '{name}'"
|
779
775
|
)
|
edsl/scenarios/scenario.py
CHANGED
@@ -264,9 +264,49 @@ class Scenario(Base, UserDict):
|
|
264
264
|
"""Display a scenario as a table."""
|
265
265
|
return self.to_dataset().table(tablefmt=tablefmt)
|
266
266
|
|
267
|
-
def
|
267
|
+
def offload(self, inplace=False) -> "Scenario":
    """
    Offloads base64-encoded content from the scenario by replacing 'base64_string'
    fields with 'offloaded'. This reduces memory usage.

    Two kinds of values are rewritten; everything else is passed through as is:

    - ``FileStore`` values are round-tripped through ``to_dict()`` /
      ``FileStore.from_dict()`` with ``base64_string`` set to ``'offloaded'``.
    - plain ``dict`` values that contain a ``'base64_string'`` key are shallow
      copied with that key set to ``'offloaded'``.

    Args:
        inplace (bool): If True, modify the current scenario. If False, return a new one.

    Returns:
        Scenario: The modified scenario (either self or a new instance).
    """
    from edsl.scenarios import FileStore

    target = self if inplace else Scenario()

    # Iterate over a snapshot: with inplace=True we assign back into the very
    # mapping we are reading from.
    for key, value in list(self.items()):
        if isinstance(value, FileStore):
            file_store_dict = value.to_dict()
            if "base64_string" in file_store_dict:
                file_store_dict["base64_string"] = "offloaded"
            modified_value = FileStore.from_dict(file_store_dict)
        elif isinstance(value, dict) and "base64_string" in value:
            value_copy = value.copy()
            value_copy["base64_string"] = "offloaded"
            modified_value = value_copy
        else:
            modified_value = value

        target[key] = modified_value

    return target
|
299
|
+
|
300
|
+
def to_dict(
|
301
|
+
self, add_edsl_version: bool = True, offload_base64: bool = False
|
302
|
+
) -> dict:
|
268
303
|
"""Convert a scenario to a dictionary.
|
269
304
|
|
305
|
+
Args:
|
306
|
+
add_edsl_version: If True, adds the EDSL version to the returned dictionary.
|
307
|
+
offload_base64: If True, replaces any base64_string fields with 'offloaded'
|
308
|
+
to reduce memory usage.
|
309
|
+
|
270
310
|
Example:
|
271
311
|
|
272
312
|
>>> s = Scenario({"food": "wood chips"})
|
@@ -283,7 +323,15 @@ class Scenario(Base, UserDict):
|
|
283
323
|
d = self.data.copy()
|
284
324
|
for key, value in d.items():
|
285
325
|
if isinstance(value, FileStore) or isinstance(value, Prompt):
|
286
|
-
|
326
|
+
value_dict = value.to_dict(add_edsl_version=add_edsl_version)
|
327
|
+
if (
|
328
|
+
offload_base64
|
329
|
+
and isinstance(value_dict, dict)
|
330
|
+
and "base64_string" in value_dict
|
331
|
+
):
|
332
|
+
value_dict["base64_string"] = "offloaded"
|
333
|
+
d[key] = value_dict
|
334
|
+
|
287
335
|
if add_edsl_version:
|
288
336
|
from edsl import __version__
|
289
337
|
|
edsl/scenarios/scenario_list.py
CHANGED
@@ -145,22 +145,18 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
|
|
145
145
|
"""Initialize a new ScenarioList with optional data and codebook."""
|
146
146
|
self._data_class = data_class
|
147
147
|
self.data = self._data_class([])
|
148
|
-
warned = False
|
149
148
|
for item in data or []:
|
150
|
-
try:
|
151
|
-
_ = json.dumps(item.to_dict())
|
152
|
-
except:
|
153
|
-
import warnings
|
154
|
-
if not warned:
|
155
|
-
warnings.warn(
|
156
|
-
f"One or more items in the data list are not JSON serializable. "
|
157
|
-
"This would prevent running a job that uses this ScenarioList."
|
158
|
-
"One solution is to use 'str(item)' to convert the item to a string before adding."
|
159
|
-
)
|
160
|
-
warned = True
|
161
149
|
self.data.append(item)
|
162
150
|
self.codebook = codebook or {}
|
163
151
|
|
152
|
+
def is_serializable(self) -> bool:
    """Return True if every item's ``to_dict()`` output can be dumped to JSON.

    Returns False as soon as ``item.to_dict()`` or ``json.dumps`` raises for
    any item in ``self.data``; an empty list is considered serializable.
    """
    for item in self.data:
        try:
            json.dumps(item.to_dict())
        except Exception:
            # Deliberately broad: any failure while converting or encoding
            # an item means the list cannot be serialized as a whole.
            return False
    return True
|
159
|
+
|
164
160
|
# Required MutableSequence abstract methods
|
165
161
|
def __getitem__(self, index):
|
166
162
|
"""Get item at index."""
|
@@ -360,6 +356,32 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
|
|
360
356
|
new_scenarios.append(Scenario(new_scenario))
|
361
357
|
|
362
358
|
return new_scenarios
|
359
|
+
|
360
|
+
@classmethod
def from_search_terms(cls, search_terms: List[str]) -> ScenarioList:
    """Create a ScenarioList from a list of search terms, using Wikipedia.

    The terms are passed to ``fetch_wikipedia_content`` and each item it
    returns is wrapped in a ``Scenario``.

    Args:
        search_terms: A list of search terms.
    """
    from ..utilities.wikipedia import fetch_wikipedia_content

    fetched = fetch_wikipedia_content(search_terms)
    scenarios = [Scenario(entry) for entry in fetched]
    return cls(scenarios)
|
370
|
+
|
371
|
+
def augment_with_wikipedia(
    self,
    search_key: str,
    content_only: bool = True,
    key_name: str = "wikipedia_content",
) -> ScenarioList:
    """Augment the ScenarioList with Wikipedia content.

    The values selected by ``search_key`` are used as Wikipedia search terms.
    Each scenario is paired positionally with one fetched result and gets a
    new ``key_name`` entry.

    Args:
        search_key: Key whose values are used as search terms.
        content_only: If True, store only the result's ``"content"`` field;
            otherwise store the whole result.
        key_name: Key under which the Wikipedia data is stored.

    Returns:
        A new ScenarioList that shares this list's codebook.

    Note:
        The scenarios of this list are modified in place and the same objects
        are appended to the returned list. Pairing uses ``zip``, so scenarios
        beyond the number of fetched results are left out.
    """
    terms = self.select(search_key).to_list()
    fetched = ScenarioList.from_search_terms(terms)
    augmented = ScenarioList(data=[], codebook=self.codebook)
    for scenario, wikipedia_result in zip(self, fetched):
        payload = wikipedia_result["content"] if content_only else wikipedia_result
        scenario[key_name] = payload
        augmented.append(scenario)
    return augmented
|
384
|
+
|
363
385
|
|
364
386
|
def pivot(
|
365
387
|
self,
|
edsl/surveys/survey.py
CHANGED
@@ -384,6 +384,10 @@ class Survey(Base):
|
|
384
384
|
if question_name not in self.question_name_to_index:
|
385
385
|
raise SurveyError(f"Question name {question_name} not found in survey.")
|
386
386
|
return self.questions[self.question_name_to_index[question_name]]
|
387
|
+
|
388
|
+
def get(self, question_name: str) -> QuestionBase:
    """Return the question object given the question name.

    Public shorthand that delegates to ``_get_question_by_name``.
    """
    question = self._get_question_by_name(question_name)
    return question
|
387
391
|
|
388
392
|
def question_names_to_questions(self) -> dict:
|
389
393
|
"""Return a dictionary mapping question names to question attributes."""
|
edsl/tasks/task_history.py
CHANGED
@@ -43,6 +43,7 @@ class TaskHistory(RepresentationMixin):
|
|
43
43
|
- Generates interactive HTML reports with filtering and drill-down
|
44
44
|
- Computes statistics across interviews (by model, question type, etc.)
|
45
45
|
- Exports to various formats (HTML, notebook, etc.)
|
46
|
+
- Memory optimization via offloading of large file content
|
46
47
|
"""
|
47
48
|
|
48
49
|
def __init__(
|
@@ -191,8 +192,22 @@ class TaskHistory(RepresentationMixin):
|
|
191
192
|
"""Return a string representation of the TaskHistory."""
|
192
193
|
return f"TaskHistory(interviews={self.total_interviews})."
|
193
194
|
|
194
|
-
def to_dict(self, add_edsl_version=True):
|
195
|
-
"""
|
195
|
+
def to_dict(self, add_edsl_version=True, offload_content=False):
|
196
|
+
"""
|
197
|
+
Return the TaskHistory as a dictionary.
|
198
|
+
|
199
|
+
Parameters:
|
200
|
+
add_edsl_version: Whether to include EDSL version in the output
|
201
|
+
offload_content: Whether to offload large file content like videos and images
|
202
|
+
to reduce memory usage
|
203
|
+
|
204
|
+
Returns:
|
205
|
+
A dictionary representation of this TaskHistory instance
|
206
|
+
"""
|
207
|
+
# Offload large file content if requested
|
208
|
+
if offload_content:
|
209
|
+
self.offload_files_content()
|
210
|
+
|
196
211
|
# Serialize each interview object
|
197
212
|
interview_dicts = []
|
198
213
|
for i in self.total_interviews:
|
@@ -255,27 +270,60 @@ class TaskHistory(RepresentationMixin):
|
|
255
270
|
InterviewExceptionCollection,
|
256
271
|
)
|
257
272
|
|
273
|
+
# Store the original data in full
|
274
|
+
self._original_data = data
|
275
|
+
|
276
|
+
# Preserve the original interview id
|
277
|
+
self._interview_id = data.get("id", None)
|
278
|
+
|
279
|
+
# Store exceptions using the original data structure
|
280
|
+
# This ensures when we re-serialize, we keep original data intact
|
281
|
+
self._exceptions_data = data.get("exceptions", {})
|
282
|
+
|
283
|
+
# Create the InterviewExceptionCollection for runtime use
|
258
284
|
exceptions_data = data.get("exceptions", {})
|
259
285
|
self.exceptions = (
|
260
286
|
InterviewExceptionCollection.from_dict(exceptions_data)
|
261
287
|
if exceptions_data
|
262
288
|
else InterviewExceptionCollection()
|
263
289
|
)
|
290
|
+
|
291
|
+
# Store other fields
|
264
292
|
self.task_status_logs = data.get("task_status_logs", {})
|
265
293
|
self.model = data.get("model", {})
|
266
294
|
self.survey = data.get("survey", {})
|
267
295
|
|
268
296
|
def to_dict(self, add_edsl_version=True):
    """Serialize this interview reference back to a plain dictionary.

    The stored ``_exceptions_data`` is emitted as is when available, so
    exception details survive a load/save round trip unchanged. Otherwise
    ``self.exceptions`` is used (via its ``to_dict()`` when it has one).

    Args:
        add_edsl_version: If True and the original data carried an
            ``"edsl_version"`` entry, copy that entry into the output.

    Returns:
        A dict with ``type``, ``exceptions``, ``task_status_logs``, ``model``
        and ``survey``, plus ``id`` / ``edsl_version`` when available.
    """
    if hasattr(self, "_exceptions_data"):
        exceptions = self._exceptions_data
    elif hasattr(self.exceptions, "to_dict"):
        exceptions = self.exceptions.to_dict()
    else:
        exceptions = self.exceptions

    data = {
        "type": "InterviewReference",
        "exceptions": exceptions,
        "task_status_logs": self.task_status_logs,
        "model": self.model,
        "survey": self.survey,
    }

    # Preserve the original interview id if it exists. Compare against None
    # so that a falsy but valid id (e.g. 0) is not dropped.
    interview_id = getattr(self, "_interview_id", None)
    if interview_id is not None:
        data["id"] = interview_id

    # Preserve original version info
    original_data = getattr(self, "_original_data", None)
    if add_edsl_version and original_data and "edsl_version" in original_data:
        data["edsl_version"] = original_data["edsl_version"]

    return data
|
326
|
+
|
279
327
|
# Create the reference and add it directly
|
280
328
|
ref = DeserializedInterviewRef(interview_data)
|
281
329
|
instance.total_interviews.append(ref)
|
@@ -728,6 +776,132 @@ class TaskHistory(RepresentationMixin):
|
|
728
776
|
|
729
777
|
return nb
|
730
778
|
|
779
|
+
def offload_files_content(self):
    """
    Offload large file content from scenarios in interview exceptions.

    This method iterates over all the interview exceptions and, for every
    exception whose invigilator carries a non-empty scenario with an
    ``offload`` method, replaces ``invigilator.scenario`` with the result of
    ``scenario.offload()``.

    Offloading is best-effort: if ``offload()`` (or the re-assignment) fails
    for an exception, that scenario is left untouched, the failure is logged
    at debug level, and processing continues.

    Returns:
        self: Returns the TaskHistory instance for method chaining
    """
    import logging

    logger = logging.getLogger(__name__)

    for interview in self.total_interviews:
        exceptions_by_question = getattr(interview, "exceptions", None)
        if not exceptions_by_question:
            continue

        for question_name, exceptions in exceptions_by_question.items():
            for exception in exceptions:
                invigilator = getattr(exception, "invigilator", None)
                if not invigilator:
                    continue

                scenario = getattr(invigilator, "scenario", None)
                if not scenario or not hasattr(scenario, "offload"):
                    continue

                try:
                    # Replace the original scenario with the offloaded version
                    invigilator.scenario = scenario.offload()
                except Exception:
                    # Deliberately non-fatal, but no longer invisible.
                    logger.debug(
                        "Could not offload scenario for question %r",
                        question_name,
                        exc_info=True,
                    )

    return self
|
818
|
+
|
819
|
+
def deduplicate_and_clean_interviews(self):
    """
    Deduplicates exception entries in this task history to reduce memory usage.

    Exceptions are identified by ``(question_name, message)``. Only the first
    occurrence of each pair across all interviews is kept; questions left with
    no exceptions are removed from their interview. Interviews that have no
    exceptions, or none left after filtering, are dropped from
    ``total_interviews``, and ``_interviews`` is rebuilt as an index -> interview
    mapping over the remaining ones.

    Returns:
        self: Returns the TaskHistory instance for method chaining.
    """

    def _message_of(entry):
        # Prefer the first argument of the wrapped exception; fall back to
        # the string form of the entry itself.
        if hasattr(entry, "exception") and hasattr(entry.exception, "args"):
            args = entry.exception.args
            return str(args[0]) if args else ""
        return str(entry)

    already_seen = set()
    surviving = []

    for interview in self.total_interviews:
        # Skip if interview has no exceptions
        if not hasattr(interview, "exceptions") or not interview.exceptions:
            continue

        has_items = hasattr(interview.exceptions, "items")
        replacements = {}
        emptied_questions = []

        # First pass: decide what to keep without touching the mapping.
        if has_items:
            for question_name, exceptions in list(interview.exceptions.items()):
                unique_entries = []
                for entry in exceptions:
                    key = (question_name, _message_of(entry))
                    if key in already_seen:
                        continue
                    already_seen.add(key)
                    unique_entries.append(entry)

                if unique_entries:
                    replacements[question_name] = unique_entries
                else:
                    emptied_questions.append(question_name)

        # Second pass: apply the collected changes.
        if has_items:
            for question_name, unique_entries in replacements.items():
                interview.exceptions[question_name] = unique_entries

            for question_name in emptied_questions:
                if hasattr(interview.exceptions, "pop"):
                    interview.exceptions.pop(question_name, None)
                elif (
                    hasattr(interview.exceptions, "__delitem__")
                    and question_name in interview.exceptions
                ):
                    del interview.exceptions[question_name]

        # Only keep the interview if it still has exceptions after filtering
        if replacements:
            surviving.append(interview)

    self.total_interviews = surviving
    self._interviews = dict(enumerate(self.total_interviews))

    return self
|
904
|
+
|
731
905
|
|
732
906
|
if __name__ == "__main__":
|
733
907
|
import doctest
|