PyPI - edsl - Versions diffs - 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl - Mend

edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

edsl/__init__.py +8 -1
edsl/__init__original.py +134 -0
edsl/__version__.py +1 -1
edsl/agents/agent.py +29 -0
edsl/agents/agent_list.py +36 -1
edsl/base/base_class.py +281 -151
edsl/base/data_transfer_models.py +15 -4
edsl/buckets/__init__.py +8 -3
edsl/buckets/bucket_collection.py +9 -3
edsl/buckets/model_buckets.py +4 -2
edsl/buckets/token_bucket.py +2 -2
edsl/buckets/token_bucket_client.py +5 -3
edsl/caching/cache.py +131 -62
edsl/caching/cache_entry.py +70 -58
edsl/caching/sql_dict.py +17 -0
edsl/cli.py +99 -0
edsl/config/config_class.py +16 -0
edsl/conversation/__init__.py +31 -0
edsl/coop/coop.py +276 -242
edsl/coop/coop_jobs_objects.py +59 -0
edsl/coop/coop_objects.py +29 -0
edsl/coop/coop_regular_objects.py +26 -0
edsl/coop/utils.py +24 -19
edsl/dataset/dataset.py +338 -101
edsl/dataset/dataset_operations_mixin.py +216 -180
edsl/db_list/sqlite_list.py +349 -0
edsl/inference_services/__init__.py +40 -5
edsl/inference_services/exceptions.py +11 -0
edsl/inference_services/services/anthropic_service.py +5 -2
edsl/inference_services/services/aws_bedrock.py +6 -2
edsl/inference_services/services/azure_ai.py +6 -2
edsl/inference_services/services/google_service.py +7 -3
edsl/inference_services/services/mistral_ai_service.py +6 -2
edsl/inference_services/services/open_ai_service.py +6 -2
edsl/inference_services/services/perplexity_service.py +6 -2
edsl/inference_services/services/test_service.py +94 -5
edsl/interviews/answering_function.py +167 -59
edsl/interviews/interview.py +124 -72
edsl/interviews/interview_task_manager.py +10 -0
edsl/interviews/request_token_estimator.py +8 -0
edsl/invigilators/invigilators.py +35 -13
edsl/jobs/async_interview_runner.py +146 -104
edsl/jobs/data_structures.py +6 -4
edsl/jobs/decorators.py +61 -0
edsl/jobs/fetch_invigilator.py +61 -18
edsl/jobs/html_table_job_logger.py +14 -2
edsl/jobs/jobs.py +180 -104
edsl/jobs/jobs_component_constructor.py +2 -2
edsl/jobs/jobs_interview_constructor.py +2 -0
edsl/jobs/jobs_pricing_estimation.py +154 -113
edsl/jobs/jobs_remote_inference_logger.py +4 -0
edsl/jobs/jobs_runner_status.py +30 -25
edsl/jobs/progress_bar_manager.py +79 -0
edsl/jobs/remote_inference.py +35 -1
edsl/key_management/key_lookup_builder.py +6 -1
edsl/language_models/language_model.py +110 -12
edsl/language_models/model.py +10 -3
edsl/language_models/price_manager.py +176 -71
edsl/language_models/registry.py +5 -0
edsl/notebooks/notebook.py +77 -10
edsl/questions/VALIDATION_README.md +134 -0
edsl/questions/__init__.py +24 -1
edsl/questions/exceptions.py +21 -0
edsl/questions/question_dict.py +201 -16
edsl/questions/question_multiple_choice_with_other.py +624 -0
edsl/questions/question_registry.py +2 -1
edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
edsl/questions/validation_analysis.py +185 -0
edsl/questions/validation_cli.py +131 -0
edsl/questions/validation_html_report.py +404 -0
edsl/questions/validation_logger.py +136 -0
edsl/results/result.py +115 -46
edsl/results/results.py +702 -171
edsl/scenarios/construct_download_link.py +16 -3
edsl/scenarios/directory_scanner.py +226 -226
edsl/scenarios/file_methods.py +5 -0
edsl/scenarios/file_store.py +150 -9
edsl/scenarios/handlers/__init__.py +5 -1
edsl/scenarios/handlers/mp4_file_store.py +104 -0
edsl/scenarios/handlers/webm_file_store.py +104 -0
edsl/scenarios/scenario.py +120 -101
edsl/scenarios/scenario_list.py +800 -727
edsl/scenarios/scenario_list_gc_test.py +146 -0
edsl/scenarios/scenario_list_memory_test.py +214 -0
edsl/scenarios/scenario_list_source_refactor.md +35 -0
edsl/scenarios/scenario_selector.py +5 -4
edsl/scenarios/scenario_source.py +1990 -0
edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
edsl/surveys/survey.py +22 -0
edsl/tasks/__init__.py +4 -2
edsl/tasks/task_history.py +198 -36
edsl/tests/scenarios/test_ScenarioSource.py +51 -0
edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
edsl/utilities/__init__.py +2 -1
edsl/utilities/decorators.py +121 -0
edsl/utilities/memory_debugger.py +1010 -0
{edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
{edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
edsl/jobs/jobs_runner_asyncio.py +0 -281
edsl/language_models/unused/fake_openai_service.py +0 -60
{edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
{edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
{edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0

edsl/results/result.py CHANGED Viewed

@@ -20,6 +20,7 @@ The Result class inherits from both Base (for serialization) and UserDict (for
 dictionary-like behavior), allowing it to be accessed like a dictionary while
 maintaining a rich object model.
 """
 from __future__ import annotations
 import inspect
 from collections import UserDict
@@ -40,6 +41,7 @@ if TYPE_CHECKING:
 QuestionName = str
 AnswerValue = Any
 class AgentNamer:
     """Maintains a registry of agent names to ensure unique naming."""
@@ -61,20 +63,20 @@ agent_namer = AgentNamer().get_name
 class Result(Base, UserDict):
     """
     The Result class captures the complete data from one agent interview.
     A Result object stores the agent, scenario, language model, and all answers
     provided during an interview, along with metadata such as token usage,
     caching information, and raw model responses. It provides a rich interface
     for accessing this data and supports serialization for storage and retrieval.
     Key features:
     - Dictionary-like access to all data through the UserDict interface
     - Properties for convenient access to common attributes (agent, scenario, model, answer)
     - Rich data structure with sub-dictionaries for organization
     - Support for scoring results against reference answers
     - Serialization to/from dictionaries for storage
     Results are typically created by the Jobs system when running interviews and
     collected into a Results collection for analysis. You rarely need to create
     Result objects manually.
@@ -260,6 +262,7 @@ class Result(Base, UserDict):
         for key in self.problem_keys:
             if key in expression and key + "." not in expression:
                 from .exceptions import ResultsColumnNotFoundError
                 raise ResultsColumnNotFoundError(
                     f"Key by itself {key} is problematic. Use the full key {key + '.' + key} name instead."
                 )
@@ -268,6 +271,7 @@ class Result(Base, UserDict):
     def code(self):
         """Return a string of code that can be used to recreate the Result object."""
         from .exceptions import ResultsError
         raise ResultsError("The code() method is not implemented for Result objects")
     @property
@@ -316,7 +320,7 @@ class Result(Base, UserDict):
     def get_value(self, data_type: str, key: str) -> Any:
         """Return the value for a given data type and key.
         This method provides a consistent way to access values across different
         sub-dictionaries in the Result object. It's particularly useful when you
         need to programmatically access values without knowing which data type
@@ -331,7 +335,7 @@ class Result(Base, UserDict):
         Returns:
             The value associated with the key in the specified data type
         Examples:
             >>> r = Result.example()
             >>> r.get_value("answer", "how_feeling")
@@ -344,15 +348,15 @@ class Result(Base, UserDict):
     @property
     def key_to_data_type(self) -> dict[str, str]:
         """A mapping of attribute names to their container data types.
         This property returns a dictionary that maps each attribute name (like 'how_feeling')
         to its containing data type or category (like 'answer'). This is useful for
         determining which part of the Result object a particular attribute belongs to,
         especially when working with data programmatically.
         If a key name appears in multiple data types, the property will automatically
         rename the conflicting keys by appending the data type name to avoid ambiguity.
         Returns:
             A dictionary mapping attribute names to their data types
@@ -435,7 +439,7 @@ class Result(Base, UserDict):
                         else prompt_obj.to_dict()
                     )
                 d[key] = new_prompt_dict
         if self.indices is not None:
             d["indices"] = self.indices
@@ -450,6 +454,13 @@ class Result(Base, UserDict):
         else:
             d.pop("cache_used_dict", None)
+        if hasattr(self, "interview_hash"):
+            d["interview_hash"] = self.interview_hash
+        # Preserve the order attribute if it exists
+        if hasattr(self, "order"):
+            d["order"] = self.order
         return d
     def __hash__(self):
@@ -488,8 +499,15 @@ class Result(Base, UserDict):
             comments_dict=json_dict.get("comments_dict", {}),
             cache_used_dict=json_dict.get("cache_used_dict", {}),
             cache_keys=json_dict.get("cache_keys", {}),
-            indices = json_dict.get("indices", None)
+            indices=json_dict.get("indices", None),
         )
+        if "interview_hash" in json_dict:
+            result.interview_hash = json_dict["interview_hash"]
+        # Restore the order attribute if it exists in the dictionary
+        if "order" in json_dict:
+            result.order = json_dict["order"]
         return result
     def __repr__(self):
@@ -508,14 +526,14 @@ class Result(Base, UserDict):
         from .results import Results
         return Results.example()[0]
     def score_with_answer_key(self, answer_key: dict) -> dict[str, int]:
         """Score the result against a reference answer key.
-        This method evaluates the correctness of answers by comparing them to a
-        provided answer key. It returns a dictionary with counts of correct,
+        This method evaluates the correctness of answers by comparing them to a
+        provided answer key. It returns a dictionary with counts of correct,
         incorrect, and missing answers.
         The answer key can contain either single values or lists of acceptable values.
         If a list is provided, the answer is considered correct if it matches any
         value in the list.
@@ -527,7 +545,7 @@ class Result(Base, UserDict):
         Returns:
             A dictionary with keys 'correct', 'incorrect', and 'missing', indicating
             the counts of each answer type.
         Examples:
             >>> Result.example()['answer']
             {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
@@ -536,21 +554,24 @@ class Result(Base, UserDict):
             >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
             >>> Result.example().score_with_answer_key(answer_key)
             {'correct': 2, 'incorrect': 0, 'missing': 0}
             >>> # Using answer key with multiple acceptable answers
             >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': ['Great', 'Good']}
             >>> Result.example().score_with_answer_key(answer_key)
             {'correct': 2, 'incorrect': 0, 'missing': 0}
         """
-        final_scores = {'correct': 0, 'incorrect': 0, 'missing': 0}
+        final_scores = {"correct": 0, "incorrect": 0, "missing": 0}
         for question_name, answer in self.answer.items():
             if question_name in answer_key:
-                if answer == answer_key[question_name] or answer in answer_key[question_name]:
-                    final_scores['correct'] += 1
+                if (
+                    answer == answer_key[question_name]
+                    or answer in answer_key[question_name]
+                ):
+                    final_scores["correct"] += 1
                 else:
-                    final_scores['incorrect'] += 1
+                    final_scores["incorrect"] += 1
             else:
-                final_scores['missing'] += 1
+                final_scores["missing"] += 1
         return final_scores
@@ -570,14 +591,17 @@ class Result(Base, UserDict):
                 params[k] = v.default
             else:
                 from .exceptions import ResultsError
                 raise ResultsError(f"Parameter {k} not found in Result object")
         return scoring_function(**params)
     @classmethod
-    def from_interview(
-        cls, interview, extracted_answers, model_response_objects
-    ) -> Result:
-        """Return a Result object from an interview dictionary."""
+    def from_interview(cls, interview) -> Result:
+        """Return a Result object from an interview dictionary, ensuring no reference to the original interview is maintained."""
+        # Copy the valid results to avoid maintaining references
+        model_response_objects = list(interview.valid_results) if hasattr(interview, 'valid_results') else []
+        # Create a copy of the answers
+        extracted_answers = dict(interview.answers) if hasattr(interview, 'answers') else {}
         def get_question_results(
             model_response_objects,
@@ -638,53 +662,98 @@ class Result(Base, UserDict):
                 raw_model_results_dictionary[question_name + "_raw_model_response"] = (
                     result.raw_model_response
                 )
-                raw_model_results_dictionary[question_name + "_cost"] = result.cost
-                one_use_buys = (
+                raw_model_results_dictionary[question_name + "_input_tokens"] = (
+                    result.input_tokens
+                )
+                raw_model_results_dictionary[question_name + "_output_tokens"] = (
+                    result.output_tokens
+                )
+                raw_model_results_dictionary[
+                    question_name + "_input_price_per_million_tokens"
+                ] = result.input_price_per_million_tokens
+                raw_model_results_dictionary[
+                    question_name + "_output_price_per_million_tokens"
+                ] = result.output_price_per_million_tokens
+                raw_model_results_dictionary[question_name + "_cost"] = (
+                    result.total_cost
+                )
+                one_usd_buys = (
                     "NA"
-                    if isinstance(result.cost, str)
-                    or result.cost == 0
-                    or result.cost is None
-                    else 1.0 / result.cost
+                    if isinstance(result.total_cost, str)
+                    or result.total_cost == 0
+                    or result.total_cost is None
+                    else 1.0 / result.total_cost
                 )
                 raw_model_results_dictionary[question_name + "_one_usd_buys"] = (
-                    one_use_buys
+                    one_usd_buys
                 )
                 cache_used_dictionary[question_name] = result.cache_used
             return raw_model_results_dictionary, cache_used_dictionary
+        # Save essential information from the interview before clearing references
+        agent_copy = interview.agent.copy() if hasattr(interview, 'agent') else None
+        scenario_copy = interview.scenario.copy() if hasattr(interview, 'scenario') else None
+        model_copy = interview.model.copy() if hasattr(interview, 'model') else None
+        iteration = interview.iteration if hasattr(interview, 'iteration') else 0
+        survey_copy = interview.survey.copy() if hasattr(interview, 'survey') and interview.survey else None
+        indices_copy = dict(interview.indices) if hasattr(interview, 'indices') and interview.indices else None
+        initial_hash = interview.initial_hash if hasattr(interview, 'initial_hash') else hash(interview)
+        # Process data to create dictionaries needed for Result
         question_results = get_question_results(model_response_objects)
         answer_key_names = list(question_results.keys())
-        generated_tokens_dict = get_generated_tokens_dict(answer_key_names)
-        comments_dict = get_comments_dict(answer_key_names)
-        answer_dict = {k: extracted_answers[k] for k in answer_key_names}
+        generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
+        comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
+        # Get answers that are in the question results
+        answer_dict = {}
+        for k in answer_key_names:
+            if k in extracted_answers:
+                answer_dict[k] = extracted_answers[k]
         cache_keys = get_cache_keys(model_response_objects)
         question_name_to_prompts = get_question_name_to_prompts(model_response_objects)
         prompt_dictionary = get_prompt_dictionary(
             answer_key_names, question_name_to_prompts
-        )
+        ) if answer_key_names else {}
         raw_model_results_dictionary, cache_used_dictionary = (
             get_raw_model_results_and_cache_used_dictionary(model_response_objects)
         )
+        # Create the Result object with all copied data
         result = cls(
-            agent=interview.agent,
-            scenario=interview.scenario,
-            model=interview.model,
-            iteration=interview.iteration,
-            # Computed objects
+            agent=agent_copy,
+            scenario=scenario_copy,
+            model=model_copy,
+            iteration=iteration,
             answer=answer_dict,
             prompt=prompt_dictionary,
             raw_model_response=raw_model_results_dictionary,
-            survey=interview.survey,
+            survey=survey_copy,
             generated_tokens=generated_tokens_dict,
             comments_dict=comments_dict,
             cache_used_dict=cache_used_dictionary,
-            indices=interview.indices,
+            indices=indices_copy,
             cache_keys=cache_keys,
         )
-        result.interview_hash = interview.initial_hash
+        # Store only the hash, not the interview
+        result.interview_hash = initial_hash
+        # Clear references to help garbage collection of the interview
+        if hasattr(interview, 'clear_references'):
+            interview.clear_references()
+        # Clear local references to help with garbage collection
+        del model_response_objects
+        del extracted_answers
+        del question_results
+        del answer_key_names
+        del question_name_to_prompts
         return result

edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl

edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl