edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/base/data_transfer_models.py +15 -4
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/dataset/dataset_operations_mixin.py +216 -180
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +7 -3
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/interviews/request_token_estimator.py +8 -0
- edsl/invigilators/invigilators.py +35 -13
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +154 -113
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +110 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +176 -71
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +115 -46
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +150 -9
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
edsl/results/result.py
CHANGED
@@ -20,6 +20,7 @@ The Result class inherits from both Base (for serialization) and UserDict (for
|
|
20
20
|
dictionary-like behavior), allowing it to be accessed like a dictionary while
|
21
21
|
maintaining a rich object model.
|
22
22
|
"""
|
23
|
+
|
23
24
|
from __future__ import annotations
|
24
25
|
import inspect
|
25
26
|
from collections import UserDict
|
@@ -40,6 +41,7 @@ if TYPE_CHECKING:
|
|
40
41
|
QuestionName = str
|
41
42
|
AnswerValue = Any
|
42
43
|
|
44
|
+
|
43
45
|
class AgentNamer:
|
44
46
|
"""Maintains a registry of agent names to ensure unique naming."""
|
45
47
|
|
@@ -61,20 +63,20 @@ agent_namer = AgentNamer().get_name
|
|
61
63
|
class Result(Base, UserDict):
|
62
64
|
"""
|
63
65
|
The Result class captures the complete data from one agent interview.
|
64
|
-
|
66
|
+
|
65
67
|
A Result object stores the agent, scenario, language model, and all answers
|
66
68
|
provided during an interview, along with metadata such as token usage,
|
67
69
|
caching information, and raw model responses. It provides a rich interface
|
68
70
|
for accessing this data and supports serialization for storage and retrieval.
|
69
|
-
|
71
|
+
|
70
72
|
Key features:
|
71
|
-
|
73
|
+
|
72
74
|
- Dictionary-like access to all data through the UserDict interface
|
73
75
|
- Properties for convenient access to common attributes (agent, scenario, model, answer)
|
74
76
|
- Rich data structure with sub-dictionaries for organization
|
75
77
|
- Support for scoring results against reference answers
|
76
78
|
- Serialization to/from dictionaries for storage
|
77
|
-
|
79
|
+
|
78
80
|
Results are typically created by the Jobs system when running interviews and
|
79
81
|
collected into a Results collection for analysis. You rarely need to create
|
80
82
|
Result objects manually.
|
@@ -260,6 +262,7 @@ class Result(Base, UserDict):
|
|
260
262
|
for key in self.problem_keys:
|
261
263
|
if key in expression and key + "." not in expression:
|
262
264
|
from .exceptions import ResultsColumnNotFoundError
|
265
|
+
|
263
266
|
raise ResultsColumnNotFoundError(
|
264
267
|
f"Key by itself {key} is problematic. Use the full key {key + '.' + key} name instead."
|
265
268
|
)
|
@@ -268,6 +271,7 @@ class Result(Base, UserDict):
|
|
268
271
|
def code(self):
|
269
272
|
"""Return a string of code that can be used to recreate the Result object."""
|
270
273
|
from .exceptions import ResultsError
|
274
|
+
|
271
275
|
raise ResultsError("The code() method is not implemented for Result objects")
|
272
276
|
|
273
277
|
@property
|
@@ -316,7 +320,7 @@ class Result(Base, UserDict):
|
|
316
320
|
|
317
321
|
def get_value(self, data_type: str, key: str) -> Any:
|
318
322
|
"""Return the value for a given data type and key.
|
319
|
-
|
323
|
+
|
320
324
|
This method provides a consistent way to access values across different
|
321
325
|
sub-dictionaries in the Result object. It's particularly useful when you
|
322
326
|
need to programmatically access values without knowing which data type
|
@@ -331,7 +335,7 @@ class Result(Base, UserDict):
|
|
331
335
|
|
332
336
|
Returns:
|
333
337
|
The value associated with the key in the specified data type
|
334
|
-
|
338
|
+
|
335
339
|
Examples:
|
336
340
|
>>> r = Result.example()
|
337
341
|
>>> r.get_value("answer", "how_feeling")
|
@@ -344,15 +348,15 @@ class Result(Base, UserDict):
|
|
344
348
|
@property
|
345
349
|
def key_to_data_type(self) -> dict[str, str]:
|
346
350
|
"""A mapping of attribute names to their container data types.
|
347
|
-
|
351
|
+
|
348
352
|
This property returns a dictionary that maps each attribute name (like 'how_feeling')
|
349
353
|
to its containing data type or category (like 'answer'). This is useful for
|
350
354
|
determining which part of the Result object a particular attribute belongs to,
|
351
355
|
especially when working with data programmatically.
|
352
|
-
|
356
|
+
|
353
357
|
If a key name appears in multiple data types, the property will automatically
|
354
358
|
rename the conflicting keys by appending the data type name to avoid ambiguity.
|
355
|
-
|
359
|
+
|
356
360
|
Returns:
|
357
361
|
A dictionary mapping attribute names to their data types
|
358
362
|
|
@@ -435,7 +439,7 @@ class Result(Base, UserDict):
|
|
435
439
|
else prompt_obj.to_dict()
|
436
440
|
)
|
437
441
|
d[key] = new_prompt_dict
|
438
|
-
|
442
|
+
|
439
443
|
if self.indices is not None:
|
440
444
|
d["indices"] = self.indices
|
441
445
|
|
@@ -450,6 +454,13 @@ class Result(Base, UserDict):
|
|
450
454
|
else:
|
451
455
|
d.pop("cache_used_dict", None)
|
452
456
|
|
457
|
+
if hasattr(self, "interview_hash"):
|
458
|
+
d["interview_hash"] = self.interview_hash
|
459
|
+
|
460
|
+
# Preserve the order attribute if it exists
|
461
|
+
if hasattr(self, "order"):
|
462
|
+
d["order"] = self.order
|
463
|
+
|
453
464
|
return d
|
454
465
|
|
455
466
|
def __hash__(self):
|
@@ -488,8 +499,15 @@ class Result(Base, UserDict):
|
|
488
499
|
comments_dict=json_dict.get("comments_dict", {}),
|
489
500
|
cache_used_dict=json_dict.get("cache_used_dict", {}),
|
490
501
|
cache_keys=json_dict.get("cache_keys", {}),
|
491
|
-
indices
|
502
|
+
indices=json_dict.get("indices", None),
|
492
503
|
)
|
504
|
+
if "interview_hash" in json_dict:
|
505
|
+
result.interview_hash = json_dict["interview_hash"]
|
506
|
+
|
507
|
+
# Restore the order attribute if it exists in the dictionary
|
508
|
+
if "order" in json_dict:
|
509
|
+
result.order = json_dict["order"]
|
510
|
+
|
493
511
|
return result
|
494
512
|
|
495
513
|
def __repr__(self):
|
@@ -508,14 +526,14 @@ class Result(Base, UserDict):
|
|
508
526
|
from .results import Results
|
509
527
|
|
510
528
|
return Results.example()[0]
|
511
|
-
|
529
|
+
|
512
530
|
def score_with_answer_key(self, answer_key: dict) -> dict[str, int]:
|
513
531
|
"""Score the result against a reference answer key.
|
514
|
-
|
515
|
-
This method evaluates the correctness of answers by comparing them to a
|
516
|
-
provided answer key. It returns a dictionary with counts of correct,
|
532
|
+
|
533
|
+
This method evaluates the correctness of answers by comparing them to a
|
534
|
+
provided answer key. It returns a dictionary with counts of correct,
|
517
535
|
incorrect, and missing answers.
|
518
|
-
|
536
|
+
|
519
537
|
The answer key can contain either single values or lists of acceptable values.
|
520
538
|
If a list is provided, the answer is considered correct if it matches any
|
521
539
|
value in the list.
|
@@ -527,7 +545,7 @@ class Result(Base, UserDict):
|
|
527
545
|
Returns:
|
528
546
|
A dictionary with keys 'correct', 'incorrect', and 'missing', indicating
|
529
547
|
the counts of each answer type.
|
530
|
-
|
548
|
+
|
531
549
|
Examples:
|
532
550
|
>>> Result.example()['answer']
|
533
551
|
{'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
|
@@ -536,21 +554,24 @@ class Result(Base, UserDict):
|
|
536
554
|
>>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
|
537
555
|
>>> Result.example().score_with_answer_key(answer_key)
|
538
556
|
{'correct': 2, 'incorrect': 0, 'missing': 0}
|
539
|
-
|
557
|
+
|
540
558
|
>>> # Using answer key with multiple acceptable answers
|
541
559
|
>>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': ['Great', 'Good']}
|
542
560
|
>>> Result.example().score_with_answer_key(answer_key)
|
543
561
|
{'correct': 2, 'incorrect': 0, 'missing': 0}
|
544
562
|
"""
|
545
|
-
final_scores = {
|
563
|
+
final_scores = {"correct": 0, "incorrect": 0, "missing": 0}
|
546
564
|
for question_name, answer in self.answer.items():
|
547
565
|
if question_name in answer_key:
|
548
|
-
if
|
549
|
-
|
566
|
+
if (
|
567
|
+
answer == answer_key[question_name]
|
568
|
+
or answer in answer_key[question_name]
|
569
|
+
):
|
570
|
+
final_scores["correct"] += 1
|
550
571
|
else:
|
551
|
-
final_scores[
|
572
|
+
final_scores["incorrect"] += 1
|
552
573
|
else:
|
553
|
-
final_scores[
|
574
|
+
final_scores["missing"] += 1
|
554
575
|
|
555
576
|
return final_scores
|
556
577
|
|
@@ -570,14 +591,17 @@ class Result(Base, UserDict):
|
|
570
591
|
params[k] = v.default
|
571
592
|
else:
|
572
593
|
from .exceptions import ResultsError
|
594
|
+
|
573
595
|
raise ResultsError(f"Parameter {k} not found in Result object")
|
574
596
|
return scoring_function(**params)
|
575
597
|
|
576
598
|
@classmethod
|
577
|
-
def from_interview(
|
578
|
-
|
579
|
-
|
580
|
-
|
599
|
+
def from_interview(cls, interview) -> Result:
|
600
|
+
"""Return a Result object from an interview dictionary, ensuring no reference to the original interview is maintained."""
|
601
|
+
# Copy the valid results to avoid maintaining references
|
602
|
+
model_response_objects = list(interview.valid_results) if hasattr(interview, 'valid_results') else []
|
603
|
+
# Create a copy of the answers
|
604
|
+
extracted_answers = dict(interview.answers) if hasattr(interview, 'answers') else {}
|
581
605
|
|
582
606
|
def get_question_results(
|
583
607
|
model_response_objects,
|
@@ -638,53 +662,98 @@ class Result(Base, UserDict):
|
|
638
662
|
raw_model_results_dictionary[question_name + "_raw_model_response"] = (
|
639
663
|
result.raw_model_response
|
640
664
|
)
|
641
|
-
raw_model_results_dictionary[question_name + "
|
642
|
-
|
665
|
+
raw_model_results_dictionary[question_name + "_input_tokens"] = (
|
666
|
+
result.input_tokens
|
667
|
+
)
|
668
|
+
raw_model_results_dictionary[question_name + "_output_tokens"] = (
|
669
|
+
result.output_tokens
|
670
|
+
)
|
671
|
+
raw_model_results_dictionary[
|
672
|
+
question_name + "_input_price_per_million_tokens"
|
673
|
+
] = result.input_price_per_million_tokens
|
674
|
+
raw_model_results_dictionary[
|
675
|
+
question_name + "_output_price_per_million_tokens"
|
676
|
+
] = result.output_price_per_million_tokens
|
677
|
+
raw_model_results_dictionary[question_name + "_cost"] = (
|
678
|
+
result.total_cost
|
679
|
+
)
|
680
|
+
one_usd_buys = (
|
643
681
|
"NA"
|
644
|
-
if isinstance(result.
|
645
|
-
or result.
|
646
|
-
or result.
|
647
|
-
else 1.0 / result.
|
682
|
+
if isinstance(result.total_cost, str)
|
683
|
+
or result.total_cost == 0
|
684
|
+
or result.total_cost is None
|
685
|
+
else 1.0 / result.total_cost
|
648
686
|
)
|
649
687
|
raw_model_results_dictionary[question_name + "_one_usd_buys"] = (
|
650
|
-
|
688
|
+
one_usd_buys
|
651
689
|
)
|
652
690
|
cache_used_dictionary[question_name] = result.cache_used
|
653
691
|
|
654
692
|
return raw_model_results_dictionary, cache_used_dictionary
|
655
693
|
|
694
|
+
# Save essential information from the interview before clearing references
|
695
|
+
agent_copy = interview.agent.copy() if hasattr(interview, 'agent') else None
|
696
|
+
scenario_copy = interview.scenario.copy() if hasattr(interview, 'scenario') else None
|
697
|
+
model_copy = interview.model.copy() if hasattr(interview, 'model') else None
|
698
|
+
iteration = interview.iteration if hasattr(interview, 'iteration') else 0
|
699
|
+
survey_copy = interview.survey.copy() if hasattr(interview, 'survey') and interview.survey else None
|
700
|
+
indices_copy = dict(interview.indices) if hasattr(interview, 'indices') and interview.indices else None
|
701
|
+
initial_hash = interview.initial_hash if hasattr(interview, 'initial_hash') else hash(interview)
|
702
|
+
|
703
|
+
# Process data to create dictionaries needed for Result
|
656
704
|
question_results = get_question_results(model_response_objects)
|
657
705
|
answer_key_names = list(question_results.keys())
|
658
|
-
generated_tokens_dict = get_generated_tokens_dict(answer_key_names)
|
659
|
-
comments_dict = get_comments_dict(answer_key_names)
|
660
|
-
|
706
|
+
generated_tokens_dict = get_generated_tokens_dict(answer_key_names) if answer_key_names else {}
|
707
|
+
comments_dict = get_comments_dict(answer_key_names) if answer_key_names else {}
|
708
|
+
|
709
|
+
# Get answers that are in the question results
|
710
|
+
answer_dict = {}
|
711
|
+
for k in answer_key_names:
|
712
|
+
if k in extracted_answers:
|
713
|
+
answer_dict[k] = extracted_answers[k]
|
714
|
+
|
661
715
|
cache_keys = get_cache_keys(model_response_objects)
|
662
716
|
|
663
717
|
question_name_to_prompts = get_question_name_to_prompts(model_response_objects)
|
664
718
|
prompt_dictionary = get_prompt_dictionary(
|
665
719
|
answer_key_names, question_name_to_prompts
|
666
|
-
)
|
720
|
+
) if answer_key_names else {}
|
721
|
+
|
667
722
|
raw_model_results_dictionary, cache_used_dictionary = (
|
668
723
|
get_raw_model_results_and_cache_used_dictionary(model_response_objects)
|
669
724
|
)
|
670
725
|
|
726
|
+
# Create the Result object with all copied data
|
671
727
|
result = cls(
|
672
|
-
agent=
|
673
|
-
scenario=
|
674
|
-
model=
|
675
|
-
iteration=
|
676
|
-
# Computed objects
|
728
|
+
agent=agent_copy,
|
729
|
+
scenario=scenario_copy,
|
730
|
+
model=model_copy,
|
731
|
+
iteration=iteration,
|
677
732
|
answer=answer_dict,
|
678
733
|
prompt=prompt_dictionary,
|
679
734
|
raw_model_response=raw_model_results_dictionary,
|
680
|
-
survey=
|
735
|
+
survey=survey_copy,
|
681
736
|
generated_tokens=generated_tokens_dict,
|
682
737
|
comments_dict=comments_dict,
|
683
738
|
cache_used_dict=cache_used_dictionary,
|
684
|
-
indices=
|
739
|
+
indices=indices_copy,
|
685
740
|
cache_keys=cache_keys,
|
686
741
|
)
|
687
|
-
|
742
|
+
|
743
|
+
# Store only the hash, not the interview
|
744
|
+
result.interview_hash = initial_hash
|
745
|
+
|
746
|
+
# Clear references to help garbage collection of the interview
|
747
|
+
if hasattr(interview, 'clear_references'):
|
748
|
+
interview.clear_references()
|
749
|
+
|
750
|
+
# Clear local references to help with garbage collection
|
751
|
+
del model_response_objects
|
752
|
+
del extracted_answers
|
753
|
+
del question_results
|
754
|
+
del answer_key_names
|
755
|
+
del question_name_to_prompts
|
756
|
+
|
688
757
|
return result
|
689
758
|
|
690
759
|
|