edsl 0.1.57__py3-none-any.whl → 0.1.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -299,23 +299,24 @@ class ListResponseValidator(ResponseValidatorABC):
299
299
  # This method can now be removed since validation is handled in the Pydantic model
300
300
  pass
301
301
 
302
- def fix(self, response, verbose=False):
302
+ def fix(self, response, verbose=False) -> dict[str, Any]:
303
303
  """
304
304
  Fix common issues in list responses by splitting strings into lists.
305
305
 
306
306
  Examples:
307
307
  >>> from edsl import QuestionList
308
- >>> q = QuestionList.example(min_list_items=2, max_list_items=4)
309
- >>> validator = q.response_validator
308
+ >>> q_constrained = QuestionList.example(min_list_items=2, max_list_items=4)
309
+ >>> validator_constrained = q_constrained.response_validator
310
310
 
311
+ >>> q_permissive = QuestionList.example(permissive=True)
312
+ >>> validator_permissive = q_permissive.response_validator
313
+
311
314
  >>> # Fix a string that should be a list
312
315
  >>> bad_response = {"answer": "apple,banana,cherry"}
313
- >>> try:
314
- ... validator.validate(bad_response)
315
- ... except Exception:
316
- ... fixed = validator.fix(bad_response)
317
- ... validated = validator.validate(fixed)
318
- ... validated # Show full response
316
+ >>> fixed = validator_constrained.fix(bad_response)
317
+ >>> fixed
318
+ {'answer': ['apple', 'banana', 'cherry']}
319
+ >>> validator_constrained.validate(fixed) # Show full response after validation
319
320
  {'answer': ['apple', 'banana', 'cherry'], 'comment': None, 'generated_tokens': None}
320
321
 
321
322
  >>> # Fix using generated_tokens when answer is invalid
@@ -323,12 +324,10 @@ class ListResponseValidator(ResponseValidatorABC):
323
324
  ... "answer": None,
324
325
  ... "generated_tokens": "pizza, pasta, salad"
325
326
  ... }
326
- >>> try:
327
- ... validator.validate(bad_response)
328
- ... except Exception:
329
- ... fixed = validator.fix(bad_response)
330
- ... validated = validator.validate(fixed)
331
- ... validated
327
+ >>> fixed = validator_constrained.fix(bad_response)
328
+ >>> fixed
329
+ {'answer': ['pizza', ' pasta', ' salad']}
330
+ >>> validator_constrained.validate(fixed)
332
331
  {'answer': ['pizza', ' pasta', ' salad'], 'comment': None, 'generated_tokens': None}
333
332
 
334
333
  >>> # Preserve comments during fixing
@@ -336,17 +335,74 @@ class ListResponseValidator(ResponseValidatorABC):
336
335
  ... "answer": "red,blue,green",
337
336
  ... "comment": "These are colors"
338
337
  ... }
339
- >>> fixed = validator.fix(bad_response)
340
- >>> fixed == {
338
+ >>> fixed_output = validator_constrained.fix(bad_response)
339
+ >>> fixed_output
340
+ {'answer': ['red', 'blue', 'green'], 'comment': 'These are colors'}
341
+ >>> validated_output = validator_constrained.validate(fixed_output)
342
+ >>> validated_output == {
341
343
  ... "answer": ["red", "blue", "green"],
342
- ... "comment": "These are colors"
344
+ ... "comment": "These are colors",
345
+ ... "generated_tokens": None
343
346
  ... }
344
347
  True
348
+
349
+ >>> # Fix an empty string answer
350
+ >>> bad_response = {"answer": ""}
351
+ >>> fixed = validator_constrained.fix(bad_response)
352
+ >>> fixed
353
+ {'answer': []}
354
+ >>> validator_permissive.validate(fixed)
355
+ {'answer': [], 'comment': None, 'generated_tokens': None}
356
+
357
+ >>> # Fix a single item string answer (no commas)
358
+ >>> bad_response = {"answer": "single_item"}
359
+ >>> fixed = validator_constrained.fix(bad_response)
360
+ >>> fixed
361
+ {'answer': ['single_item']}
362
+ >>> validator_permissive.validate(fixed)
363
+ {'answer': ['single_item'], 'comment': None, 'generated_tokens': None}
364
+
365
+ >>> # Fix when answer is None and no generated_tokens
366
+ >>> bad_response = {"answer": None}
367
+ >>> fixed = validator_constrained.fix(bad_response)
368
+ >>> fixed
369
+ {'answer': []}
370
+ >>> validator_permissive.validate(fixed)
371
+ {'answer': [], 'comment': None, 'generated_tokens': None}
372
+
373
+ >>> # Fix when answer key is missing but generated_tokens is present
374
+ >>> bad_response = {"generated_tokens": "token1,token2"}
375
+ >>> fixed = validator_constrained.fix(bad_response)
376
+ >>> fixed
377
+ {'answer': ['token1', 'token2']}
378
+ >>> validator_constrained.validate(fixed) # 2 items, OK for constrained validator
379
+ {'answer': ['token1', 'token2'], 'comment': None, 'generated_tokens': None}
380
+
381
+ >>> # Fix when answer key is missing and generated_tokens is an empty string
382
+ >>> bad_response = {"generated_tokens": ""}
383
+ >>> fixed = validator_constrained.fix(bad_response)
384
+ >>> fixed
385
+ {'answer': []}
386
+ >>> validator_permissive.validate(fixed)
387
+ {'answer': [], 'comment': None, 'generated_tokens': None}
388
+
389
+ >>> # Fix when answer key is missing and generated_tokens is a single item
390
+ >>> bad_response = {"generated_tokens": "single_token"}
391
+ >>> fixed = validator_constrained.fix(bad_response)
392
+ >>> fixed
393
+ {'answer': ['single_token']}
394
+ >>> validator_permissive.validate(fixed)
395
+ {'answer': ['single_token'], 'comment': None, 'generated_tokens': None}
345
396
  """
346
397
  if verbose:
347
398
  print(f"Fixing list response: {response}")
348
399
  answer = str(response.get("answer") or response.get("generated_tokens", ""))
349
- result = {"answer": answer.split(",")}
400
+ if "," in answer:
401
+ result = {"answer": answer.split(",")}
402
+ elif answer == "":
403
+ result = {"answer": []}
404
+ else:
405
+ result = {"answer": [answer]}
350
406
  if "comment" in response:
351
407
  result["comment"] = response["comment"]
352
408
  return result
@@ -395,7 +451,7 @@ class QuestionList(QuestionBase):
395
451
 
396
452
  self.include_comment = include_comment
397
453
  self.answering_instructions = answering_instructions
398
- self.question_presentations = question_presentation
454
+ self.question_presentation = question_presentation
399
455
 
400
456
  def create_response_model(self):
401
457
  return create_model(self.min_list_items, self.max_list_items, self.permissive)
edsl/results/results.py CHANGED
@@ -771,6 +771,10 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
771
771
  def to_dataset(self) -> "Dataset":
772
772
  return self.select()
773
773
 
774
+ def optimzie_scenarios(self):
775
+ for result in self.data:
776
+ result.scenario.offload(inplace=True)
777
+
774
778
  def to_dict(
775
779
  self,
776
780
  sort: bool = False,
@@ -778,9 +782,12 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
778
782
  include_cache: bool = True,
779
783
  include_task_history: bool = False,
780
784
  include_cache_info: bool = True,
785
+ offload_scenarios: bool = True,
781
786
  ) -> dict[str, Any]:
782
787
  from ..caching import Cache
783
788
 
789
+ if offload_scenarios:
790
+ self.optimzie_scenarios()
784
791
  if sort:
785
792
  data = sorted([result for result in self.data], key=lambda x: hash(x))
786
793
  else:
@@ -809,7 +816,7 @@ class Results(MutableSequence, ResultsOperationsMixin, Base):
809
816
  )
810
817
 
811
818
  if self.task_history.has_unfixed_exceptions or include_task_history:
812
- d.update({"task_history": self.task_history.to_dict()})
819
+ d.update({"task_history": self.task_history.to_dict(offload_content=True)})
813
820
 
814
821
  if add_edsl_version:
815
822
  from .. import __version__
@@ -446,9 +446,7 @@ class FileStore(Scenario):
446
446
  if suffix is None:
447
447
  suffix = self.suffix
448
448
  if self.binary:
449
- file_like_object = self.base64_to_file(
450
- self["base64_string"], is_binary=True
451
- )
449
+ file_like_object = self.base64_to_file(self.base64_string, is_binary=True)
452
450
  else:
453
451
  file_like_object = self.base64_to_text_file(self.base64_string)
454
452
 
@@ -765,15 +763,13 @@ class FileStore(Scenario):
765
763
  if name.startswith("__") and name.endswith("__"):
766
764
  raise AttributeError(name)
767
765
 
768
- # Only try to access suffix if it's in our __dict__
769
- if hasattr(self, "_data") and "suffix" in self._data:
770
- if self._data["suffix"] == "csv":
771
- # Get the pandas DataFrame
772
- df = self.to_pandas()
773
- # Check if the requested attribute exists in the DataFrame
774
- if hasattr(df, name):
775
- return getattr(df, name)
776
- # If not a CSV or attribute doesn't exist in DataFrame, raise AttributeError
766
+ # Check for _data directly in __dict__ to avoid recursion
767
+ _data = self.__dict__.get("_data", None)
768
+ if _data and _data.get("suffix") == "csv":
769
+ df = self.to_pandas()
770
+ if hasattr(df, name):
771
+ return getattr(df, name)
772
+
777
773
  raise AttributeError(
778
774
  f"'{self.__class__.__name__}' object has no attribute '{name}'"
779
775
  )
@@ -264,9 +264,49 @@ class Scenario(Base, UserDict):
264
264
  """Display a scenario as a table."""
265
265
  return self.to_dataset().table(tablefmt=tablefmt)
266
266
 
267
- def to_dict(self, add_edsl_version: bool = True) -> dict:
267
+ def offload(self, inplace=False) -> "Scenario":
268
+ """
269
+ Offloads base64-encoded content from the scenario by replacing 'base64_string'
270
+ fields with 'offloaded'. This reduces memory usage.
271
+
272
+ Args:
273
+ inplace (bool): If True, modify the current scenario. If False, return a new one.
274
+
275
+ Returns:
276
+ Scenario: The modified scenario (either self or a new instance).
277
+ """
278
+ from edsl.scenarios import FileStore
279
+ from edsl.prompts import Prompt
280
+
281
+ target = self if inplace else Scenario()
282
+
283
+ for key, value in self.items():
284
+ if isinstance(value, FileStore):
285
+ file_store_dict = value.to_dict()
286
+ if "base64_string" in file_store_dict:
287
+ file_store_dict["base64_string"] = "offloaded"
288
+ modified_value = FileStore.from_dict(file_store_dict)
289
+ elif isinstance(value, dict) and "base64_string" in value:
290
+ value_copy = value.copy()
291
+ value_copy["base64_string"] = "offloaded"
292
+ modified_value = value_copy
293
+ else:
294
+ modified_value = value
295
+
296
+ target[key] = modified_value
297
+
298
+ return target
299
+
300
+ def to_dict(
301
+ self, add_edsl_version: bool = True, offload_base64: bool = False
302
+ ) -> dict:
268
303
  """Convert a scenario to a dictionary.
269
304
 
305
+ Args:
306
+ add_edsl_version: If True, adds the EDSL version to the returned dictionary.
307
+ offload_base64: If True, replaces any base64_string fields with 'offloaded'
308
+ to reduce memory usage.
309
+
270
310
  Example:
271
311
 
272
312
  >>> s = Scenario({"food": "wood chips"})
@@ -283,7 +323,15 @@ class Scenario(Base, UserDict):
283
323
  d = self.data.copy()
284
324
  for key, value in d.items():
285
325
  if isinstance(value, FileStore) or isinstance(value, Prompt):
286
- d[key] = value.to_dict(add_edsl_version=add_edsl_version)
326
+ value_dict = value.to_dict(add_edsl_version=add_edsl_version)
327
+ if (
328
+ offload_base64
329
+ and isinstance(value_dict, dict)
330
+ and "base64_string" in value_dict
331
+ ):
332
+ value_dict["base64_string"] = "offloaded"
333
+ d[key] = value_dict
334
+
287
335
  if add_edsl_version:
288
336
  from edsl import __version__
289
337
 
@@ -145,22 +145,18 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
145
145
  """Initialize a new ScenarioList with optional data and codebook."""
146
146
  self._data_class = data_class
147
147
  self.data = self._data_class([])
148
- warned = False
149
148
  for item in data or []:
150
- try:
151
- _ = json.dumps(item.to_dict())
152
- except:
153
- import warnings
154
- if not warned:
155
- warnings.warn(
156
- f"One or more items in the data list are not JSON serializable. "
157
- "This would prevent running a job that uses this ScenarioList."
158
- "One solution is to use 'str(item)' to convert the item to a string before adding."
159
- )
160
- warned = True
161
149
  self.data.append(item)
162
150
  self.codebook = codebook or {}
163
151
 
152
+ def is_serializable(self):
153
+ for item in self.data:
154
+ try:
155
+ _ = json.dumps(item.to_dict())
156
+ except Exception as e:
157
+ return False
158
+ return True
159
+
164
160
  # Required MutableSequence abstract methods
165
161
  def __getitem__(self, index):
166
162
  """Get item at index."""
@@ -360,6 +356,32 @@ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
360
356
  new_scenarios.append(Scenario(new_scenario))
361
357
 
362
358
  return new_scenarios
359
+
360
+ @classmethod
361
+ def from_search_terms(cls, search_terms: List[str]) -> ScenarioList:
362
+ """Create a ScenarioList from a list of search terms, using Wikipedia.
363
+
364
+ Args:
365
+ search_terms: A list of search terms.
366
+ """
367
+ from ..utilities.wikipedia import fetch_wikipedia_content
368
+ results = fetch_wikipedia_content(search_terms)
369
+ return cls([Scenario(result) for result in results])
370
+
371
+ def augment_with_wikipedia(self, search_key:str, content_only: bool = True, key_name: str = "wikipedia_content") -> ScenarioList:
372
+ """Augment the ScenarioList with Wikipedia content."""
373
+ search_terms = self.select(search_key).to_list()
374
+ wikipedia_results = ScenarioList.from_search_terms(search_terms)
375
+ new_sl = ScenarioList(data = [], codebook = self.codebook)
376
+ for scenario, wikipedia_result in zip(self, wikipedia_results):
377
+ if content_only:
378
+ scenario[key_name] = wikipedia_result["content"]
379
+ new_sl.append(scenario)
380
+ else:
381
+ scenario[key_name] = wikipedia_result
382
+ new_sl.append(scenario)
383
+ return new_sl
384
+
363
385
 
364
386
  def pivot(
365
387
  self,
edsl/surveys/survey.py CHANGED
@@ -384,6 +384,10 @@ class Survey(Base):
384
384
  if question_name not in self.question_name_to_index:
385
385
  raise SurveyError(f"Question name {question_name} not found in survey.")
386
386
  return self.questions[self.question_name_to_index[question_name]]
387
+
388
+ def get(self, question_name: str) -> QuestionBase:
389
+ """Return the question object given the question name."""
390
+ return self._get_question_by_name(question_name)
387
391
 
388
392
  def question_names_to_questions(self) -> dict:
389
393
  """Return a dictionary mapping question names to question attributes."""
@@ -43,6 +43,7 @@ class TaskHistory(RepresentationMixin):
43
43
  - Generates interactive HTML reports with filtering and drill-down
44
44
  - Computes statistics across interviews (by model, question type, etc.)
45
45
  - Exports to various formats (HTML, notebook, etc.)
46
+ - Memory optimization via offloading of large file content
46
47
  """
47
48
 
48
49
  def __init__(
@@ -191,8 +192,22 @@ class TaskHistory(RepresentationMixin):
191
192
  """Return a string representation of the TaskHistory."""
192
193
  return f"TaskHistory(interviews={self.total_interviews})."
193
194
 
194
- def to_dict(self, add_edsl_version=True):
195
- """Return the TaskHistory as a dictionary."""
195
+ def to_dict(self, add_edsl_version=True, offload_content=False):
196
+ """
197
+ Return the TaskHistory as a dictionary.
198
+
199
+ Parameters:
200
+ add_edsl_version: Whether to include EDSL version in the output
201
+ offload_content: Whether to offload large file content like videos and images
202
+ to reduce memory usage
203
+
204
+ Returns:
205
+ A dictionary representation of this TaskHistory instance
206
+ """
207
+ # Offload large file content if requested
208
+ if offload_content:
209
+ self.offload_files_content()
210
+
196
211
  # Serialize each interview object
197
212
  interview_dicts = []
198
213
  for i in self.total_interviews:
@@ -255,27 +270,60 @@ class TaskHistory(RepresentationMixin):
255
270
  InterviewExceptionCollection,
256
271
  )
257
272
 
273
+ # Store the original data in full
274
+ self._original_data = data
275
+
276
+ # Preserve the original interview id
277
+ self._interview_id = data.get("id", None)
278
+
279
+ # Store exceptions using the original data structure
280
+ # This ensures when we re-serialize, we keep original data intact
281
+ self._exceptions_data = data.get("exceptions", {})
282
+
283
+ # Create the InterviewExceptionCollection for runtime use
258
284
  exceptions_data = data.get("exceptions", {})
259
285
  self.exceptions = (
260
286
  InterviewExceptionCollection.from_dict(exceptions_data)
261
287
  if exceptions_data
262
288
  else InterviewExceptionCollection()
263
289
  )
290
+
291
+ # Store other fields
264
292
  self.task_status_logs = data.get("task_status_logs", {})
265
293
  self.model = data.get("model", {})
266
294
  self.survey = data.get("survey", {})
267
295
 
268
296
  def to_dict(self, add_edsl_version=True):
269
- return {
297
+ # Use the original exceptions data structure when serializing again
298
+ # This preserves all exception details exactly as they were
299
+ data = {
270
300
  "type": "InterviewReference",
271
- "exceptions": self.exceptions.to_dict()
272
- if hasattr(self.exceptions, "to_dict")
273
- else self.exceptions,
301
+ "exceptions": self._exceptions_data
302
+ if hasattr(self, "_exceptions_data")
303
+ else (
304
+ self.exceptions.to_dict()
305
+ if hasattr(self.exceptions, "to_dict")
306
+ else self.exceptions
307
+ ),
274
308
  "task_status_logs": self.task_status_logs,
275
309
  "model": self.model,
276
310
  "survey": self.survey,
277
311
  }
278
312
 
313
+ # Preserve the original interview id if it exists
314
+ if self._interview_id:
315
+ data["id"] = self._interview_id
316
+
317
+ # Preserve original version info
318
+ if (
319
+ add_edsl_version
320
+ and hasattr(self, "_original_data")
321
+ and "edsl_version" in self._original_data
322
+ ):
323
+ data["edsl_version"] = self._original_data["edsl_version"]
324
+
325
+ return data
326
+
279
327
  # Create the reference and add it directly
280
328
  ref = DeserializedInterviewRef(interview_data)
281
329
  instance.total_interviews.append(ref)
@@ -728,6 +776,132 @@ class TaskHistory(RepresentationMixin):
728
776
 
729
777
  return nb
730
778
 
779
+ def offload_files_content(self):
780
+ """
781
+ Offload large file content from scenarios in interview exceptions.
782
+
783
+ This method iterates over all the interview exceptions and calls the offload method
784
+ for any scenario components in the invigilator. This significantly reduces memory usage
785
+ by replacing base64-encoded content with a placeholder string, while preserving the
786
+ structure of the scenarios.
787
+
788
+ Returns:
789
+ self: Returns the TaskHistory instance for method chaining
790
+
791
+ This is particularly useful for TaskHistory instances containing interviews with
792
+ large file content, such as videos, images, or other binary data.
793
+ """
794
+ for interview in self.total_interviews:
795
+ if not hasattr(interview, "exceptions") or not interview.exceptions:
796
+ continue
797
+
798
+ for question_name, exceptions in interview.exceptions.items():
799
+ for exception in exceptions:
800
+ # Check if exception has an invigilator with scenario
801
+ if hasattr(exception, "invigilator") and exception.invigilator:
802
+ if (
803
+ hasattr(exception.invigilator, "scenario")
804
+ and exception.invigilator.scenario
805
+ ):
806
+ # Call the offload method on the scenario
807
+ if hasattr(exception.invigilator.scenario, "offload"):
808
+ try:
809
+ # Replace the original scenario with the offloaded version
810
+ exception.invigilator.scenario = (
811
+ exception.invigilator.scenario.offload()
812
+ )
813
+ except Exception as e:
814
+ # Silently continue if offloading fails for any reason
815
+ pass
816
+
817
+ return self
818
+
819
+ def deduplicate_and_clean_interviews(self):
820
+ """
821
+ Deduplicates exception entries in this task history to reduce memory usage.
822
+
823
+ This method removes duplicate error messages across interviews while preserving
824
+ the first occurrence of each unique error. This significantly reduces the size
825
+ of serialized task history data, especially for jobs with many similar errors.
826
+
827
+ Returns:
828
+ self: Returns the TaskHistory instance for method chaining.
829
+ """
830
+ seen = set()
831
+ cleaned_interviews = []
832
+
833
+ for interview in self.total_interviews:
834
+ # Skip if interview has no exceptions
835
+ if not hasattr(interview, "exceptions") or not interview.exceptions:
836
+ continue
837
+
838
+ keep_interview = False
839
+ questions_to_modify = {}
840
+ questions_to_remove = []
841
+
842
+ # First pass: Collect all modifications without changing the dictionary
843
+ if hasattr(interview.exceptions, "items"):
844
+ for question_name, exceptions in list(interview.exceptions.items()):
845
+ filtered_exceptions = []
846
+
847
+ for exception in exceptions:
848
+ # Get the exception message (may require different access based on structure)
849
+ if hasattr(exception, "exception") and hasattr(
850
+ exception.exception, "args"
851
+ ):
852
+ message = (
853
+ str(exception.exception.args[0])
854
+ if exception.exception.args
855
+ else ""
856
+ )
857
+ else:
858
+ message = str(exception)
859
+
860
+ # Create a unique key for this exception
861
+ key = (question_name, message)
862
+
863
+ # Only keep exceptions we haven't seen before
864
+ if key not in seen:
865
+ seen.add(key)
866
+ filtered_exceptions.append(exception)
867
+
868
+ # Track what should happen to this question's exceptions
869
+ if filtered_exceptions:
870
+ keep_interview = True
871
+ questions_to_modify[question_name] = filtered_exceptions
872
+ else:
873
+ questions_to_remove.append(question_name)
874
+
875
+ # Second pass: Apply all modifications safely
876
+ if hasattr(interview.exceptions, "items"):
877
+ # Add/replace filtered exceptions
878
+ for question_name, filtered_exceptions in questions_to_modify.items():
879
+ interview.exceptions[question_name] = filtered_exceptions
880
+
881
+ # Remove questions with all duplicate exceptions
882
+ for question_name in questions_to_remove:
883
+ if hasattr(interview.exceptions, "pop"):
884
+ interview.exceptions.pop(question_name, None)
885
+ elif (
886
+ hasattr(interview.exceptions, "__delitem__")
887
+ and question_name in interview.exceptions
888
+ ):
889
+ del interview.exceptions[question_name]
890
+
891
+ # Only keep the interview if it still has exceptions after filtering
892
+ if keep_interview:
893
+ cleaned_interviews.append(interview)
894
+
895
+ # Replace the total_interviews with our cleaned list
896
+ self.total_interviews = cleaned_interviews
897
+
898
+ # Rebuild the _interviews dictionary
899
+ self._interviews = {
900
+ index: interview for index, interview in enumerate(self.total_interviews)
901
+ }
902
+
903
+ return self
904
+
731
905
 
732
906
  if __name__ == "__main__":
733
907
  import doctest