edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/base/data_transfer_models.py +15 -4
  8. edsl/buckets/__init__.py +8 -3
  9. edsl/buckets/bucket_collection.py +9 -3
  10. edsl/buckets/model_buckets.py +4 -2
  11. edsl/buckets/token_bucket.py +2 -2
  12. edsl/buckets/token_bucket_client.py +5 -3
  13. edsl/caching/cache.py +131 -62
  14. edsl/caching/cache_entry.py +70 -58
  15. edsl/caching/sql_dict.py +17 -0
  16. edsl/cli.py +99 -0
  17. edsl/config/config_class.py +16 -0
  18. edsl/conversation/__init__.py +31 -0
  19. edsl/coop/coop.py +276 -242
  20. edsl/coop/coop_jobs_objects.py +59 -0
  21. edsl/coop/coop_objects.py +29 -0
  22. edsl/coop/coop_regular_objects.py +26 -0
  23. edsl/coop/utils.py +24 -19
  24. edsl/dataset/dataset.py +338 -101
  25. edsl/dataset/dataset_operations_mixin.py +216 -180
  26. edsl/db_list/sqlite_list.py +349 -0
  27. edsl/inference_services/__init__.py +40 -5
  28. edsl/inference_services/exceptions.py +11 -0
  29. edsl/inference_services/services/anthropic_service.py +5 -2
  30. edsl/inference_services/services/aws_bedrock.py +6 -2
  31. edsl/inference_services/services/azure_ai.py +6 -2
  32. edsl/inference_services/services/google_service.py +7 -3
  33. edsl/inference_services/services/mistral_ai_service.py +6 -2
  34. edsl/inference_services/services/open_ai_service.py +6 -2
  35. edsl/inference_services/services/perplexity_service.py +6 -2
  36. edsl/inference_services/services/test_service.py +94 -5
  37. edsl/interviews/answering_function.py +167 -59
  38. edsl/interviews/interview.py +124 -72
  39. edsl/interviews/interview_task_manager.py +10 -0
  40. edsl/interviews/request_token_estimator.py +8 -0
  41. edsl/invigilators/invigilators.py +35 -13
  42. edsl/jobs/async_interview_runner.py +146 -104
  43. edsl/jobs/data_structures.py +6 -4
  44. edsl/jobs/decorators.py +61 -0
  45. edsl/jobs/fetch_invigilator.py +61 -18
  46. edsl/jobs/html_table_job_logger.py +14 -2
  47. edsl/jobs/jobs.py +180 -104
  48. edsl/jobs/jobs_component_constructor.py +2 -2
  49. edsl/jobs/jobs_interview_constructor.py +2 -0
  50. edsl/jobs/jobs_pricing_estimation.py +154 -113
  51. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  52. edsl/jobs/jobs_runner_status.py +30 -25
  53. edsl/jobs/progress_bar_manager.py +79 -0
  54. edsl/jobs/remote_inference.py +35 -1
  55. edsl/key_management/key_lookup_builder.py +6 -1
  56. edsl/language_models/language_model.py +110 -12
  57. edsl/language_models/model.py +10 -3
  58. edsl/language_models/price_manager.py +176 -71
  59. edsl/language_models/registry.py +5 -0
  60. edsl/notebooks/notebook.py +77 -10
  61. edsl/questions/VALIDATION_README.md +134 -0
  62. edsl/questions/__init__.py +24 -1
  63. edsl/questions/exceptions.py +21 -0
  64. edsl/questions/question_dict.py +201 -16
  65. edsl/questions/question_multiple_choice_with_other.py +624 -0
  66. edsl/questions/question_registry.py +2 -1
  67. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  68. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  69. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  70. edsl/questions/validation_analysis.py +185 -0
  71. edsl/questions/validation_cli.py +131 -0
  72. edsl/questions/validation_html_report.py +404 -0
  73. edsl/questions/validation_logger.py +136 -0
  74. edsl/results/result.py +115 -46
  75. edsl/results/results.py +702 -171
  76. edsl/scenarios/construct_download_link.py +16 -3
  77. edsl/scenarios/directory_scanner.py +226 -226
  78. edsl/scenarios/file_methods.py +5 -0
  79. edsl/scenarios/file_store.py +150 -9
  80. edsl/scenarios/handlers/__init__.py +5 -1
  81. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  82. edsl/scenarios/handlers/webm_file_store.py +104 -0
  83. edsl/scenarios/scenario.py +120 -101
  84. edsl/scenarios/scenario_list.py +800 -727
  85. edsl/scenarios/scenario_list_gc_test.py +146 -0
  86. edsl/scenarios/scenario_list_memory_test.py +214 -0
  87. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  88. edsl/scenarios/scenario_selector.py +5 -4
  89. edsl/scenarios/scenario_source.py +1990 -0
  90. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  91. edsl/surveys/survey.py +22 -0
  92. edsl/tasks/__init__.py +4 -2
  93. edsl/tasks/task_history.py +198 -36
  94. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  95. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  96. edsl/utilities/__init__.py +2 -1
  97. edsl/utilities/decorators.py +121 -0
  98. edsl/utilities/memory_debugger.py +1010 -0
  99. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
  100. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
  101. edsl/jobs/jobs_runner_asyncio.py +0 -281
  102. edsl/language_models/unused/fake_openai_service.py +0 -60
  103. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
  104. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
  105. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
edsl/results/result.py CHANGED
@@ -20,6 +20,7 @@ The Result class inherits from both Base (for serialization) and UserDict (for
20
20
  dictionary-like behavior), allowing it to be accessed like a dictionary while
21
21
  maintaining a rich object model.
22
22
  """
23
+
23
24
  from __future__ import annotations
24
25
  import inspect
25
26
  from collections import UserDict
@@ -40,6 +41,7 @@ if TYPE_CHECKING:
40
41
  QuestionName = str
41
42
  AnswerValue = Any
42
43
 
44
+
43
45
  class AgentNamer:
44
46
  """Maintains a registry of agent names to ensure unique naming."""
45
47
 
@@ -61,20 +63,20 @@ agent_namer = AgentNamer().get_name
61
63
  class Result(Base, UserDict):
62
64
  """
63
65
  The Result class captures the complete data from one agent interview.
64
-
66
+
65
67
  A Result object stores the agent, scenario, language model, and all answers
66
68
  provided during an interview, along with metadata such as token usage,
67
69
  caching information, and raw model responses. It provides a rich interface
68
70
  for accessing this data and supports serialization for storage and retrieval.
69
-
71
+
70
72
  Key features:
71
-
73
+
72
74
  - Dictionary-like access to all data through the UserDict interface
73
75
  - Properties for convenient access to common attributes (agent, scenario, model, answer)
74
76
  - Rich data structure with sub-dictionaries for organization
75
77
  - Support for scoring results against reference answers
76
78
  - Serialization to/from dictionaries for storage
77
-
79
+
78
80
  Results are typically created by the Jobs system when running interviews and
79
81
  collected into a Results collection for analysis. You rarely need to create
80
82
  Result objects manually.
@@ -260,6 +262,7 @@ class Result(Base, UserDict):
260
262
  for key in self.problem_keys:
261
263
  if key in expression and key + "." not in expression:
262
264
  from .exceptions import ResultsColumnNotFoundError
265
+
263
266
  raise ResultsColumnNotFoundError(
264
267
  f"Key by itself {key} is problematic. Use the full key {key + '.' + key} name instead."
265
268
  )
@@ -268,6 +271,7 @@ class Result(Base, UserDict):
268
271
  def code(self):
269
272
  """Return a string of code that can be used to recreate the Result object."""
270
273
  from .exceptions import ResultsError
274
+
271
275
  raise ResultsError("The code() method is not implemented for Result objects")
272
276
 
273
277
  @property
@@ -316,7 +320,7 @@ class Result(Base, UserDict):
316
320
 
317
321
  def get_value(self, data_type: str, key: str) -> Any:
318
322
  """Return the value for a given data type and key.
319
-
323
+
320
324
  This method provides a consistent way to access values across different
321
325
  sub-dictionaries in the Result object. It's particularly useful when you
322
326
  need to programmatically access values without knowing which data type
@@ -331,7 +335,7 @@ class Result(Base, UserDict):
331
335
 
332
336
  Returns:
333
337
  The value associated with the key in the specified data type
334
-
338
+
335
339
  Examples:
336
340
  >>> r = Result.example()
337
341
  >>> r.get_value("answer", "how_feeling")
@@ -344,15 +348,15 @@ class Result(Base, UserDict):
344
348
  @property
345
349
  def key_to_data_type(self) -> dict[str, str]:
346
350
  """A mapping of attribute names to their container data types.
347
-
351
+
348
352
  This property returns a dictionary that maps each attribute name (like 'how_feeling')
349
353
  to its containing data type or category (like 'answer'). This is useful for
350
354
  determining which part of the Result object a particular attribute belongs to,
351
355
  especially when working with data programmatically.
352
-
356
+
353
357
  If a key name appears in multiple data types, the property will automatically
354
358
  rename the conflicting keys by appending the data type name to avoid ambiguity.
355
-
359
+
356
360
  Returns:
357
361
  A dictionary mapping attribute names to their data types
358
362
 
@@ -435,7 +439,7 @@ class Result(Base, UserDict):
435
439
  else prompt_obj.to_dict()
436
440
  )
437
441
  d[key] = new_prompt_dict
438
-
442
+
439
443
  if self.indices is not None:
440
444
  d["indices"] = self.indices
441
445
 
@@ -450,6 +454,13 @@ class Result(Base, UserDict):
450
454
  else:
451
455
  d.pop("cache_used_dict", None)
452
456
 
457
+ if hasattr(self, "interview_hash"):
458
+ d["interview_hash"] = self.interview_hash
459
+
460
+ # Preserve the order attribute if it exists
461
+ if hasattr(self, "order"):
462
+ d["order"] = self.order
463
+
453
464
  return d
454
465
 
455
466
  def __hash__(self):
@@ -488,8 +499,15 @@ class Result(Base, UserDict):
488
499
  comments_dict=json_dict.get("comments_dict", {}),
489
500
  cache_used_dict=json_dict.get("cache_used_dict", {}),
490
501
  cache_keys=json_dict.get("cache_keys", {}),
491
- indices = json_dict.get("indices", None)
502
+ indices=json_dict.get("indices", None),
492
503
  )
504
+ if "interview_hash" in json_dict:
505
+ result.interview_hash = json_dict["interview_hash"]
506
+
507
+ # Restore the order attribute if it exists in the dictionary
508
+ if "order" in json_dict:
509
+ result.order = json_dict["order"]
510
+
493
511
  return result
494
512
 
495
513
  def __repr__(self):
@@ -508,14 +526,14 @@ class Result(Base, UserDict):
508
526
  from .results import Results
509
527
 
510
528
  return Results.example()[0]
511
-
529
+
512
530
  def score_with_answer_key(self, answer_key: dict) -> dict[str, int]:
513
531
  """Score the result against a reference answer key.
514
-
515
- This method evaluates the correctness of answers by comparing them to a
516
- provided answer key. It returns a dictionary with counts of correct,
532
+
533
+ This method evaluates the correctness of answers by comparing them to a
534
+ provided answer key. It returns a dictionary with counts of correct,
517
535
  incorrect, and missing answers.
518
-
536
+
519
537
  The answer key can contain either single values or lists of acceptable values.
520
538
  If a list is provided, the answer is considered correct if it matches any
521
539
  value in the list.
@@ -527,7 +545,7 @@ class Result(Base, UserDict):
527
545
  Returns:
528
546
  A dictionary with keys 'correct', 'incorrect', and 'missing', indicating
529
547
  the counts of each answer type.
530
-
548
+
531
549
  Examples:
532
550
  >>> Result.example()['answer']
533
551
  {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
@@ -536,21 +554,24 @@ class Result(Base, UserDict):
536
554
  >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
537
555
  >>> Result.example().score_with_answer_key(answer_key)
538
556
  {'correct': 2, 'incorrect': 0, 'missing': 0}
539
-
557
+
540
558
  >>> # Using answer key with multiple acceptable answers
541
559
  >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': ['Great', 'Good']}
542
560
  >>> Result.example().score_with_answer_key(answer_key)
543
561
  {'correct': 2, 'incorrect': 0, 'missing': 0}
544
562
  """
545
- final_scores = {'correct': 0, 'incorrect': 0, 'missing': 0}
563
+ final_scores = {"correct": 0, "incorrect": 0, "missing": 0}
546
564
  for question_name, answer in self.answer.items():
547
565
  if question_name in answer_key:
548
- if answer == answer_key[question_name] or answer in answer_key[question_name]:
549
- final_scores['correct'] += 1
566
+ if (
567
+ answer == answer_key[question_name]
568
+ or answer in answer_key[question_name]
569
+ ):
570
+ final_scores["correct"] += 1
550
571
  else:
551
- final_scores['incorrect'] += 1
572
+ final_scores["incorrect"] += 1
552
573
  else:
553
- final_scores['missing'] += 1
574
+ final_scores["missing"] += 1
554
575
 
555
576
  return final_scores
556
577
 
@@ -570,14 +591,17 @@ class Result(Base, UserDict):
570
591
  params[k] = v.default
571
592
  else:
572
593
  from .exceptions import ResultsError
594
+
573
595
  raise ResultsError(f"Parameter {k} not found in Result object")
574
596
  return scoring_function(**params)
575
597
 
576
598
  @classmethod
577
- def from_interview(
578
- cls, interview, extracted_answers, model_response_objects
579
- ) -> Result:
580
- """Return a Result object from an interview dictionary."""
599
+ def from_interview(cls, interview) -> Result:
600
+ """Return a Result object from an interview dictionary, ensuring no reference to the original interview is maintained."""
601
+ # Copy the valid results to avoid maintaining references
602
+ model_response_objects = list(interview.valid_results) if hasattr(interview, 'valid_results') else []
603
+ # Create a copy of the answers
604
+ extracted_answers = dict(interview.answers) if hasattr(interview, 'answers') else {}
581
605
 
582
606
  def get_question_results(
583
607
  model_response_objects,
@@ -638,53 +662,98 @@ class Result(Base, UserDict):
638
662
  raw_model_results_dictionary[question_name + "_raw_model_response"] = (
639
663
  result.raw_model_response
640
664
  )
641
- raw_model_results_dictionary[question_name + "_cost"] = result.cost
642
- one_use_buys = (
665
+ raw_model_results_dictionary[question_name + "_input_tokens"] = (
666
+ result.input_tokens
667
+ )
668
+ raw_model_results_dictionary[question_name + "_output_tokens"] = (
669
+ result.output_tokens
670
+ )
671
+ raw_model_results_dictionary[
672
+ question_name + "_input_price_per_million_tokens"
673
+ ] = result.input_price_per_million_tokens
674
+ raw_model_results_dictionary[
675
+ question_name + "_output_price_per_million_tokens"
676
+ ] = result.output_price_per_million_tokens
677
+ raw_model_results_dictionary[question_name + "_cost"] = (
678
+ result.total_cost
679
+ )
680
+ one_usd_buys = (
643
681
  "NA"
644
- if isinstance(result.cost, str)
645
- or result.cost == 0
646
- or result.cost is None
647
- else 1.0 / result.cost
682
+ if isinstance(result.total_cost, str)
683
+ or result.total_cost == 0
684
+ or result.total_cost is None
685
+ else 1.0 / result.total_cost
648
686
  )
649
687
  raw_model_results_dictionary[question_name + "_one_usd_buys"] = (
650
- one_use_buys
688
+ one_usd_buys
651
689
  )
652
690
  cache_used_dictionary[question_name] = result.cache_used
653
691
 
654
692
  return raw_model_results_dictionary, cache_used_dictionary
655
693
 
694
+ # Save essential information from the interview before clearing references
695
+ agent_copy = interview.agent.copy() if hasattr(interview, 'agent') else None
696
+ scenario_copy = interview.scenario.copy() if hasattr(interview, 'scenario') else None
697
+ model_copy = interview.model.copy() if hasattr(interview, 'model') else None
698
+ iteration = interview.iteration if hasattr(interview, 'iteration') else 0
699
+ survey_copy = interview.survey.copy() if hasattr(interview, 'survey') and interview.survey else None
700
+ indices_copy = dict(interview.indices) if hasattr(interview, 'indices') and interview.indices else None
701
+ initial_hash = interview.initial_hash if hasattr(interview, 'initial_hash') else hash(interview)
702
+
703
+ # Process data to create dictionaries needed for Result
656
704
  question_results = get_question_results(model_response_objects)
657
705
  answer_key_names = list(question_results.keys())
658
- generated_tokens_dict = get_generated_tokens_dict(answer_key_names)
659
- comments_dict = get_comments_dict(answer_key_names)
660
- answer_dict = {k: extracted_answers[k] for k in answer_key_names}
706
+ generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
707
+ comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
708
+
709
+ # Get answers that are in the question results
710
+ answer_dict = {}
711
+ for k in answer_key_names:
712
+ if k in extracted_answers:
713
+ answer_dict[k] = extracted_answers[k]
714
+
661
715
  cache_keys = get_cache_keys(model_response_objects)
662
716
 
663
717
  question_name_to_prompts = get_question_name_to_prompts(model_response_objects)
664
718
  prompt_dictionary = get_prompt_dictionary(
665
719
  answer_key_names, question_name_to_prompts
666
- )
720
+ ) if answer_key_names else {}
721
+
667
722
  raw_model_results_dictionary, cache_used_dictionary = (
668
723
  get_raw_model_results_and_cache_used_dictionary(model_response_objects)
669
724
  )
670
725
 
726
+ # Create the Result object with all copied data
671
727
  result = cls(
672
- agent=interview.agent,
673
- scenario=interview.scenario,
674
- model=interview.model,
675
- iteration=interview.iteration,
676
- # Computed objects
728
+ agent=agent_copy,
729
+ scenario=scenario_copy,
730
+ model=model_copy,
731
+ iteration=iteration,
677
732
  answer=answer_dict,
678
733
  prompt=prompt_dictionary,
679
734
  raw_model_response=raw_model_results_dictionary,
680
- survey=interview.survey,
735
+ survey=survey_copy,
681
736
  generated_tokens=generated_tokens_dict,
682
737
  comments_dict=comments_dict,
683
738
  cache_used_dict=cache_used_dictionary,
684
- indices=interview.indices,
739
+ indices=indices_copy,
685
740
  cache_keys=cache_keys,
686
741
  )
687
- result.interview_hash = interview.initial_hash
742
+
743
+ # Store only the hash, not the interview
744
+ result.interview_hash = initial_hash
745
+
746
+ # Clear references to help garbage collection of the interview
747
+ if hasattr(interview, 'clear_references'):
748
+ interview.clear_references()
749
+
750
+ # Clear local references to help with garbage collection
751
+ del model_response_objects
752
+ del extracted_answers
753
+ del question_results
754
+ del answer_key_names
755
+ del question_name_to_prompts
756
+
688
757
  return result
689
758
 
690
759