edsl 0.1.39.dev1__py3-none-any.whl → 0.1.39.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/Base.py +169 -116
- edsl/__init__.py +14 -6
- edsl/__version__.py +1 -1
- edsl/agents/Agent.py +358 -146
- edsl/agents/AgentList.py +211 -73
- edsl/agents/Invigilator.py +88 -36
- edsl/agents/InvigilatorBase.py +59 -70
- edsl/agents/PromptConstructor.py +117 -219
- edsl/agents/QuestionInstructionPromptBuilder.py +128 -0
- edsl/agents/QuestionOptionProcessor.py +172 -0
- edsl/agents/QuestionTemplateReplacementsBuilder.py +137 -0
- edsl/agents/__init__.py +0 -1
- edsl/agents/prompt_helpers.py +3 -3
- edsl/config.py +22 -2
- edsl/conversation/car_buying.py +2 -1
- edsl/coop/CoopFunctionsMixin.py +15 -0
- edsl/coop/ExpectedParrotKeyHandler.py +125 -0
- edsl/coop/PriceFetcher.py +1 -1
- edsl/coop/coop.py +104 -42
- edsl/coop/utils.py +14 -14
- edsl/data/Cache.py +21 -14
- edsl/data/CacheEntry.py +12 -15
- edsl/data/CacheHandler.py +33 -12
- edsl/data/__init__.py +4 -3
- edsl/data_transfer_models.py +2 -1
- edsl/enums.py +20 -0
- edsl/exceptions/__init__.py +50 -50
- edsl/exceptions/agents.py +12 -0
- edsl/exceptions/inference_services.py +5 -0
- edsl/exceptions/questions.py +24 -6
- edsl/exceptions/scenarios.py +7 -0
- edsl/inference_services/AnthropicService.py +0 -3
- edsl/inference_services/AvailableModelCacheHandler.py +184 -0
- edsl/inference_services/AvailableModelFetcher.py +209 -0
- edsl/inference_services/AwsBedrock.py +0 -2
- edsl/inference_services/AzureAI.py +0 -2
- edsl/inference_services/GoogleService.py +2 -11
- edsl/inference_services/InferenceServiceABC.py +18 -85
- edsl/inference_services/InferenceServicesCollection.py +105 -80
- edsl/inference_services/MistralAIService.py +0 -3
- edsl/inference_services/OpenAIService.py +1 -4
- edsl/inference_services/PerplexityService.py +0 -3
- edsl/inference_services/ServiceAvailability.py +135 -0
- edsl/inference_services/TestService.py +11 -8
- edsl/inference_services/data_structures.py +62 -0
- edsl/jobs/AnswerQuestionFunctionConstructor.py +188 -0
- edsl/jobs/Answers.py +1 -14
- edsl/jobs/FetchInvigilator.py +40 -0
- edsl/jobs/InterviewTaskManager.py +98 -0
- edsl/jobs/InterviewsConstructor.py +48 -0
- edsl/jobs/Jobs.py +102 -243
- edsl/jobs/JobsChecks.py +35 -10
- edsl/jobs/JobsComponentConstructor.py +189 -0
- edsl/jobs/JobsPrompts.py +5 -3
- edsl/jobs/JobsRemoteInferenceHandler.py +128 -80
- edsl/jobs/JobsRemoteInferenceLogger.py +239 -0
- edsl/jobs/RequestTokenEstimator.py +30 -0
- edsl/jobs/buckets/BucketCollection.py +44 -3
- edsl/jobs/buckets/TokenBucket.py +53 -21
- edsl/jobs/buckets/TokenBucketAPI.py +211 -0
- edsl/jobs/buckets/TokenBucketClient.py +191 -0
- edsl/jobs/decorators.py +35 -0
- edsl/jobs/interviews/Interview.py +77 -380
- edsl/jobs/jobs_status_enums.py +9 -0
- edsl/jobs/loggers/HTMLTableJobLogger.py +304 -0
- edsl/jobs/runners/JobsRunnerAsyncio.py +4 -49
- edsl/jobs/tasks/QuestionTaskCreator.py +21 -19
- edsl/jobs/tasks/TaskHistory.py +14 -15
- edsl/jobs/tasks/task_status_enum.py +0 -2
- edsl/language_models/ComputeCost.py +63 -0
- edsl/language_models/LanguageModel.py +137 -234
- edsl/language_models/ModelList.py +11 -13
- edsl/language_models/PriceManager.py +127 -0
- edsl/language_models/RawResponseHandler.py +106 -0
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/__init__.py +0 -1
- edsl/language_models/key_management/KeyLookup.py +63 -0
- edsl/language_models/key_management/KeyLookupBuilder.py +273 -0
- edsl/language_models/key_management/KeyLookupCollection.py +38 -0
- edsl/language_models/key_management/__init__.py +0 -0
- edsl/language_models/key_management/models.py +131 -0
- edsl/language_models/registry.py +49 -59
- edsl/language_models/repair.py +2 -2
- edsl/language_models/utilities.py +5 -4
- edsl/notebooks/Notebook.py +19 -14
- edsl/notebooks/NotebookToLaTeX.py +142 -0
- edsl/prompts/Prompt.py +29 -39
- edsl/questions/AnswerValidatorMixin.py +47 -2
- edsl/questions/ExceptionExplainer.py +77 -0
- edsl/questions/HTMLQuestion.py +103 -0
- edsl/questions/LoopProcessor.py +149 -0
- edsl/questions/QuestionBase.py +37 -192
- edsl/questions/QuestionBaseGenMixin.py +52 -48
- edsl/questions/QuestionBasePromptsMixin.py +7 -3
- edsl/questions/QuestionCheckBox.py +1 -1
- edsl/questions/QuestionExtract.py +1 -1
- edsl/questions/QuestionFreeText.py +1 -2
- edsl/questions/QuestionList.py +3 -5
- edsl/questions/QuestionMatrix.py +265 -0
- edsl/questions/QuestionMultipleChoice.py +66 -22
- edsl/questions/QuestionNumerical.py +1 -3
- edsl/questions/QuestionRank.py +6 -16
- edsl/questions/ResponseValidatorABC.py +37 -11
- edsl/questions/ResponseValidatorFactory.py +28 -0
- edsl/questions/SimpleAskMixin.py +4 -3
- edsl/questions/__init__.py +1 -0
- edsl/questions/derived/QuestionLinearScale.py +6 -3
- edsl/questions/derived/QuestionTopK.py +1 -1
- edsl/questions/descriptors.py +17 -3
- edsl/questions/question_registry.py +1 -1
- edsl/questions/templates/matrix/__init__.py +1 -0
- edsl/questions/templates/matrix/answering_instructions.jinja +5 -0
- edsl/questions/templates/matrix/question_presentation.jinja +20 -0
- edsl/results/CSSParameterizer.py +1 -1
- edsl/results/Dataset.py +170 -7
- edsl/results/DatasetExportMixin.py +224 -302
- edsl/results/DatasetTree.py +28 -8
- edsl/results/MarkdownToDocx.py +122 -0
- edsl/results/MarkdownToPDF.py +111 -0
- edsl/results/Result.py +192 -206
- edsl/results/Results.py +120 -113
- edsl/results/ResultsExportMixin.py +2 -0
- edsl/results/Selector.py +23 -13
- edsl/results/TableDisplay.py +98 -171
- edsl/results/TextEditor.py +50 -0
- edsl/results/__init__.py +1 -1
- edsl/results/smart_objects.py +96 -0
- edsl/results/table_data_class.py +12 -0
- edsl/results/table_renderers.py +118 -0
- edsl/scenarios/ConstructDownloadLink.py +109 -0
- edsl/scenarios/DirectoryScanner.py +96 -0
- edsl/scenarios/DocumentChunker.py +102 -0
- edsl/scenarios/DocxScenario.py +16 -0
- edsl/scenarios/FileStore.py +118 -239
- edsl/scenarios/PdfExtractor.py +40 -0
- edsl/scenarios/Scenario.py +90 -193
- edsl/scenarios/ScenarioHtmlMixin.py +4 -3
- edsl/scenarios/ScenarioJoin.py +10 -6
- edsl/scenarios/ScenarioList.py +383 -240
- edsl/scenarios/ScenarioListExportMixin.py +0 -7
- edsl/scenarios/ScenarioListPdfMixin.py +15 -37
- edsl/scenarios/ScenarioSelector.py +156 -0
- edsl/scenarios/__init__.py +1 -2
- edsl/scenarios/file_methods.py +85 -0
- edsl/scenarios/handlers/__init__.py +13 -0
- edsl/scenarios/handlers/csv.py +38 -0
- edsl/scenarios/handlers/docx.py +76 -0
- edsl/scenarios/handlers/html.py +37 -0
- edsl/scenarios/handlers/json.py +111 -0
- edsl/scenarios/handlers/latex.py +5 -0
- edsl/scenarios/handlers/md.py +51 -0
- edsl/scenarios/handlers/pdf.py +68 -0
- edsl/scenarios/handlers/png.py +39 -0
- edsl/scenarios/handlers/pptx.py +105 -0
- edsl/scenarios/handlers/py.py +294 -0
- edsl/scenarios/handlers/sql.py +313 -0
- edsl/scenarios/handlers/sqlite.py +149 -0
- edsl/scenarios/handlers/txt.py +33 -0
- edsl/study/ObjectEntry.py +1 -1
- edsl/study/SnapShot.py +1 -1
- edsl/study/Study.py +5 -12
- edsl/surveys/ConstructDAG.py +92 -0
- edsl/surveys/EditSurvey.py +221 -0
- edsl/surveys/InstructionHandler.py +100 -0
- edsl/surveys/MemoryManagement.py +72 -0
- edsl/surveys/Rule.py +5 -4
- edsl/surveys/RuleCollection.py +25 -27
- edsl/surveys/RuleManager.py +172 -0
- edsl/surveys/Simulator.py +75 -0
- edsl/surveys/Survey.py +199 -771
- edsl/surveys/SurveyCSS.py +20 -8
- edsl/surveys/{SurveyFlowVisualizationMixin.py → SurveyFlowVisualization.py} +11 -9
- edsl/surveys/SurveyToApp.py +141 -0
- edsl/surveys/__init__.py +4 -2
- edsl/surveys/descriptors.py +6 -2
- edsl/surveys/instructions/ChangeInstruction.py +1 -2
- edsl/surveys/instructions/Instruction.py +4 -13
- edsl/surveys/instructions/InstructionCollection.py +11 -6
- edsl/templates/error_reporting/interview_details.html +1 -1
- edsl/templates/error_reporting/report.html +1 -1
- edsl/tools/plotting.py +1 -1
- edsl/utilities/PrettyList.py +56 -0
- edsl/utilities/is_notebook.py +18 -0
- edsl/utilities/is_valid_variable_name.py +11 -0
- edsl/utilities/remove_edsl_version.py +24 -0
- edsl/utilities/utilities.py +35 -23
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/METADATA +12 -10
- edsl-0.1.39.dev2.dist-info/RECORD +352 -0
- edsl/language_models/KeyLookup.py +0 -30
- edsl/language_models/unused/ReplicateBase.py +0 -83
- edsl/results/ResultsDBMixin.py +0 -238
- edsl-0.1.39.dev1.dist-info/RECORD +0 -277
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/LICENSE +0 -0
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
|
4
|
+
class PdfExtractor:
    """Extract the text of a PDF file and wrap it in an object of the caller's type.

    The extracted payload is a dictionary with the keys ``"filename"`` (the base
    name of the path) and ``"text"`` (the text blocks of every page, each
    followed by a newline).
    """

    def __init__(self, pdf_path: str, parent_object: object):
        """Remember the PDF location and the constructor used to build the result.

        :param pdf_path: Path to the PDF file to read.
        :param parent_object: Either an instance or a class. ``get_object`` builds
            its result by calling that class with the extracted dictionary.
        """
        self.pdf_path = pdf_path
        # Callers may hand over a class directly (e.g. ``cls`` from a classmethod).
        # ``__class__`` of a class is its metaclass, which cannot build the result,
        # so a class is used as-is and only instances are mapped to their type.
        if isinstance(parent_object, type):
            self.constructor = parent_object
        else:
            self.constructor = parent_object.__class__

    def get_object(self) -> object:
        """Return ``constructor(<extracted dict>)`` for the configured PDF.

        :raises FileNotFoundError: If ``pdf_path`` does not exist.
        """
        return self.constructor(self._get_pdf_dict())

    def _get_pdf_dict(self) -> dict:
        """Read the PDF and return ``{"filename": ..., "text": ...}``.

        :raises FileNotFoundError: If ``pdf_path`` does not exist.
        """
        # Ensure the file exists before paying for the (optional) fitz import.
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"The file {self.pdf_path} does not exist.")

        import fitz

        # Get the filename from the path
        filename = os.path.basename(self.pdf_path)

        pieces = []
        document = fitz.open(self.pdf_path)
        try:
            for page_num in range(len(document)):
                page = document.load_page(page_num)
                blocks = page.get_text("blocks")  # Extract text blocks

                # Sort by y0 first, then x0, to approximate reading order
                blocks.sort(key=lambda b: (b[1], b[0]))

                # Index 4 of a block tuple holds its text
                pieces.extend(block[4] + "\n" for block in blocks)
        finally:
            # Release the file handle even if extraction fails part-way
            document.close()

        return {"filename": filename, "text": "".join(pieces)}
|
edsl/scenarios/Scenario.py
CHANGED
@@ -2,54 +2,65 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
import copy
|
5
|
-
import hashlib
|
6
5
|
import os
|
7
6
|
import json
|
8
7
|
from collections import UserDict
|
9
|
-
from typing import Union, List, Optional,
|
8
|
+
from typing import Union, List, Optional, TYPE_CHECKING, Collection
|
10
9
|
from uuid import uuid4
|
11
10
|
|
12
11
|
from edsl.Base import Base
|
13
12
|
from edsl.scenarios.ScenarioHtmlMixin import ScenarioHtmlMixin
|
14
|
-
from edsl.utilities.
|
13
|
+
from edsl.utilities.remove_edsl_version import remove_edsl_version
|
15
14
|
from edsl.exceptions.scenarios import ScenarioError
|
16
15
|
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from edsl.scenarios.ScenarioList import ScenarioList
|
18
|
+
from edsl.results.Dataset import Dataset
|
19
|
+
|
17
20
|
|
18
21
|
class DisplayJSON:
|
19
|
-
|
20
|
-
|
22
|
+
"""Display a dictionary as JSON."""
|
23
|
+
|
24
|
+
def __init__(self, input_dict: dict):
|
25
|
+
self.text = json.dumps(input_dict, indent=4)
|
21
26
|
|
22
27
|
def __repr__(self):
|
23
28
|
return self.text
|
24
29
|
|
25
30
|
|
26
31
|
class DisplayYAML:
|
27
|
-
|
32
|
+
"""Display a dictionary as YAML."""
|
33
|
+
|
34
|
+
def __init__(self, input_dict: dict):
|
28
35
|
import yaml
|
29
36
|
|
30
|
-
self.text = yaml.dump(
|
37
|
+
self.text = yaml.dump(input_dict)
|
31
38
|
|
32
39
|
def __repr__(self):
|
33
40
|
return self.text
|
34
41
|
|
35
42
|
|
36
43
|
class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
37
|
-
"""A Scenario is a dictionary of keys/values.
|
38
|
-
|
39
|
-
They can be used parameterize EDSL questions."""
|
44
|
+
"""A Scenario is a dictionary of keys/values that can be used to parameterize questions."""
|
40
45
|
|
41
46
|
__documentation__ = "https://docs.expectedparrot.com/en/latest/scenarios.html"
|
42
47
|
|
43
|
-
def __init__(self, data:
|
48
|
+
def __init__(self, data: Optional[dict] = None, name: Optional[str] = None):
|
44
49
|
"""Initialize a new Scenario.
|
45
50
|
|
46
|
-
|
47
|
-
|
51
|
+
:param data: A dictionary of keys/values for parameterizing questions.
|
52
|
+
:param name: The name of the scenario.
|
53
|
+
"""
|
48
54
|
if not isinstance(data, dict) and data is not None:
|
49
|
-
|
50
|
-
|
51
|
-
|
55
|
+
try:
|
56
|
+
data = dict(data)
|
57
|
+
except Exception as e:
|
58
|
+
raise ScenarioError(
|
59
|
+
f"You must pass in a dictionary to initialize a Scenario. You passed in {data}",
|
60
|
+
"Exception message:" + str(e),
|
61
|
+
)
|
52
62
|
|
63
|
+
super().__init__()
|
53
64
|
self.data = data if data is not None else {}
|
54
65
|
self.name = name
|
55
66
|
|
@@ -59,7 +70,6 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
59
70
|
:param n: The number of times to replicate the scenario.
|
60
71
|
|
61
72
|
Example:
|
62
|
-
|
63
73
|
>>> s = Scenario({"food": "wood chips"})
|
64
74
|
>>> s.replicate(2)
|
65
75
|
ScenarioList([Scenario({'food': 'wood chips'}), Scenario({'food': 'wood chips'})])
|
@@ -82,13 +92,13 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
82
92
|
return True
|
83
93
|
return False
|
84
94
|
|
85
|
-
def
|
86
|
-
self, replacement_left="<<", replacement_right=">>"
|
95
|
+
def _convert_jinja_braces(
|
96
|
+
self, replacement_left: str = "<<", replacement_right: str = ">>"
|
87
97
|
) -> Scenario:
|
88
98
|
"""Convert Jinja braces to some other character.
|
89
99
|
|
90
100
|
>>> s = Scenario({"food": "I love {{wood chips}}"})
|
91
|
-
>>> s.
|
101
|
+
>>> s._convert_jinja_braces()
|
92
102
|
Scenario({'food': 'I love <<wood chips>>'})
|
93
103
|
|
94
104
|
"""
|
@@ -102,7 +112,7 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
102
112
|
new_scenario[key] = value
|
103
113
|
return new_scenario
|
104
114
|
|
105
|
-
def __add__(self, other_scenario:
|
115
|
+
def __add__(self, other_scenario: Scenario) -> Scenario:
|
106
116
|
"""Combine two scenarios by taking the union of their keys
|
107
117
|
|
108
118
|
If the other scenario is None, then just return self.
|
@@ -127,11 +137,14 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
127
137
|
return s
|
128
138
|
|
129
139
|
def rename(
|
130
|
-
self,
|
131
|
-
|
140
|
+
self,
|
141
|
+
old_name_or_replacement_dict: Union[str, dict[str, str]],
|
142
|
+
new_name: Optional[str] = None,
|
143
|
+
) -> Scenario:
|
132
144
|
"""Rename the keys of a scenario.
|
133
145
|
|
134
|
-
:param
|
146
|
+
:param old_name_or_replacement_dict: A dictionary of old keys to new keys *OR* a string of the old key.
|
147
|
+
:param new_name: The new name of the key.
|
135
148
|
|
136
149
|
Example:
|
137
150
|
|
@@ -156,13 +169,26 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
156
169
|
new_scenario[key] = value
|
157
170
|
return new_scenario
|
158
171
|
|
159
|
-
def
|
160
|
-
|
172
|
+
def new_column_names(self, new_names: List[str]) -> Scenario:
    """Return a new scenario whose keys are replaced, in order, by ``new_names``.

    The i-th value of this scenario is stored under the i-th entry of
    ``new_names``. If the number of names differs from the number of keys, a
    message is printed and the pairing stops at the shorter of the two.

    :param new_names: The replacement key names, in the scenario's key order.

    >>> s = Scenario({"food": "wood chips"})
    >>> s.new_column_names(["food_preference"])
    Scenario({'food_preference': 'wood chips'})
    """
    # Explicit comparison rather than ``assert`` so the message is still
    # emitted when Python runs with ``-O`` (which strips assert statements).
    if len(new_names) != len(self.keys()):
        print("The number of new names must match the number of keys.")

    new_scenario = Scenario()
    # Distinct loop name: the original rebound the ``new_names`` parameter here.
    for new_name, value in zip(new_names, self.values()):
        new_scenario[new_name] = value
    return new_scenario
|
161
188
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
return d.table(tablefmt=tablefmt)
|
189
|
+
def table(self, tablefmt: str = "grid") -> str:
|
190
|
+
"""Display a scenario as a table."""
|
191
|
+
return self.to_dataset().table(tablefmt=tablefmt)
|
166
192
|
|
167
193
|
def json(self):
|
168
194
|
return DisplayJSON(self.to_dict(add_edsl_version=False))
|
@@ -172,7 +198,7 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
172
198
|
|
173
199
|
return DisplayYAML(self.to_dict(add_edsl_version=False))
|
174
200
|
|
175
|
-
def to_dict(self, add_edsl_version=True) -> dict:
|
201
|
+
def to_dict(self, add_edsl_version: bool = True) -> dict:
|
176
202
|
"""Convert a scenario to a dictionary.
|
177
203
|
|
178
204
|
Example:
|
@@ -200,8 +226,7 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
200
226
|
return d
|
201
227
|
|
202
228
|
def __hash__(self) -> int:
|
203
|
-
"""
|
204
|
-
Return a hash of the scenario.
|
229
|
+
"""Return a hash of the scenario.
|
205
230
|
|
206
231
|
Example:
|
207
232
|
|
@@ -213,44 +238,23 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
213
238
|
|
214
239
|
return dict_hash(self.to_dict(add_edsl_version=False))
|
215
240
|
|
216
|
-
def print(self):
|
217
|
-
from rich import print_json
|
218
|
-
import json
|
219
|
-
|
220
|
-
print_json(json.dumps(self.to_dict()))
|
221
|
-
|
222
241
|
def __repr__(self):
|
223
242
|
return "Scenario(" + repr(self.data) + ")"
|
224
243
|
|
225
244
|
def to_dataset(self) -> "Dataset":
|
226
|
-
|
245
|
+
"""Convert a scenario to a dataset.
|
246
|
+
|
247
|
+
>>> s = Scenario({"food": "wood chips"})
|
248
|
+
>>> s.to_dataset()
|
249
|
+
Dataset([{'key': ['food']}, {'value': ['wood chips']}])
|
250
|
+
"""
|
227
251
|
from edsl.results.Dataset import Dataset
|
228
252
|
|
229
|
-
keys =
|
230
|
-
values =
|
253
|
+
keys = list(self.keys())
|
254
|
+
values = list(self.values())
|
231
255
|
return Dataset([{"key": keys}, {"value": values}])
|
232
256
|
|
233
|
-
def
|
234
|
-
from tabulate import tabulate
|
235
|
-
import reprlib
|
236
|
-
|
237
|
-
d = self.to_dict(add_edsl_version=False)
|
238
|
-
# return self.to_dataset()
|
239
|
-
r = reprlib.Repr()
|
240
|
-
r.maxstring = 70
|
241
|
-
|
242
|
-
data = [[k, r.repr(v)] for k, v in d.items()]
|
243
|
-
from tabulate import tabulate
|
244
|
-
|
245
|
-
if hasattr(self, "__documentation__"):
|
246
|
-
footer = f"<a href='{self.__documentation__}'>(docs)</a></p>"
|
247
|
-
else:
|
248
|
-
footer = ""
|
249
|
-
|
250
|
-
table = str(tabulate(data, headers=["keys", "values"], tablefmt="html"))
|
251
|
-
return f"<pre>{table}</pre>" + footer
|
252
|
-
|
253
|
-
def select(self, list_of_keys: List[str]) -> "Scenario":
|
257
|
+
def select(self, list_of_keys: Collection[str]) -> "Scenario":
|
254
258
|
"""Select a subset of keys from a scenario.
|
255
259
|
|
256
260
|
:param list_of_keys: The keys to select.
|
@@ -266,7 +270,7 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
266
270
|
new_scenario[key] = self[key]
|
267
271
|
return new_scenario
|
268
272
|
|
269
|
-
def drop(self, list_of_keys:
|
273
|
+
def drop(self, list_of_keys: Collection[str]) -> "Scenario":
|
270
274
|
"""Drop a subset of keys from a scenario.
|
271
275
|
|
272
276
|
:param list_of_keys: The keys to drop.
|
@@ -320,7 +324,7 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
320
324
|
... _ = f.flush()
|
321
325
|
... s = Scenario.from_file(f.name, "file")
|
322
326
|
>>> s
|
323
|
-
Scenario({'file': FileStore(path='...')})
|
327
|
+
Scenario({'file': FileStore(path='...', ...)})
|
324
328
|
|
325
329
|
"""
|
326
330
|
from edsl.scenarios.FileStore import FileStore
|
@@ -351,35 +355,10 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
351
355
|
return cls.from_file(image_path, image_name)
|
352
356
|
|
353
357
|
@classmethod
|
354
|
-
def from_pdf(cls, pdf_path):
|
355
|
-
|
356
|
-
import fitz
|
357
|
-
|
358
|
-
if not os.path.exists(pdf_path):
|
359
|
-
raise FileNotFoundError(f"The file {pdf_path} does not exist.")
|
360
|
-
|
361
|
-
# Open the PDF file
|
362
|
-
document = fitz.open(pdf_path)
|
363
|
-
|
364
|
-
# Get the filename from the path
|
365
|
-
filename = os.path.basename(pdf_path)
|
358
|
+
def from_pdf(cls, pdf_path: str):
|
359
|
+
from edsl.scenarios.PdfExtractor import PdfExtractor
|
366
360
|
|
367
|
-
|
368
|
-
text = ""
|
369
|
-
for page_num in range(len(document)):
|
370
|
-
page = document.load_page(page_num)
|
371
|
-
blocks = page.get_text("blocks") # Extract text blocks
|
372
|
-
|
373
|
-
# Sort blocks by their vertical position (y0) to maintain reading order
|
374
|
-
blocks.sort(key=lambda b: (b[1], b[0])) # Sort by y0 first, then x0
|
375
|
-
|
376
|
-
# Combine the text blocks in order
|
377
|
-
for block in blocks:
|
378
|
-
text += block[4] + "\n"
|
379
|
-
|
380
|
-
# Create a dictionary for the combined text
|
381
|
-
page_info = {"filename": filename, "text": text}
|
382
|
-
return Scenario(page_info)
|
361
|
+
return PdfExtractor(pdf_path, cls).get_object()
|
383
362
|
|
384
363
|
@classmethod
|
385
364
|
def from_docx(cls, docx_path: str) -> "Scenario":
|
@@ -399,52 +378,9 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
399
378
|
Scenario({'file_path': 'test.docx', 'text': 'EDSL Survey\\nThis is a test.'})
|
400
379
|
>>> import os; os.remove("test.docx")
|
401
380
|
"""
|
402
|
-
from
|
403
|
-
|
404
|
-
doc = Document(docx_path)
|
381
|
+
from edsl.scenarios.DocxScenario import DocxScenario
|
405
382
|
|
406
|
-
|
407
|
-
full_text = []
|
408
|
-
for para in doc.paragraphs:
|
409
|
-
full_text.append(para.text)
|
410
|
-
|
411
|
-
# Join the text from all paragraphs
|
412
|
-
text = "\n".join(full_text)
|
413
|
-
return Scenario({"file_path": docx_path, "text": text})
|
414
|
-
|
415
|
-
@staticmethod
|
416
|
-
def _line_chunks(text, num_lines: int) -> Generator[str, None, None]:
|
417
|
-
"""Split a text into chunks of a given size.
|
418
|
-
|
419
|
-
:param text: The text to split.
|
420
|
-
:param num_lines: The number of lines in each chunk.
|
421
|
-
|
422
|
-
Example:
|
423
|
-
|
424
|
-
>>> list(Scenario._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
|
425
|
-
['This is a test.', 'This is a test. This is a test.']
|
426
|
-
"""
|
427
|
-
lines = text.split("\n")
|
428
|
-
for i in range(0, len(lines), num_lines):
|
429
|
-
chunk = "\n".join(lines[i : i + num_lines])
|
430
|
-
yield chunk
|
431
|
-
|
432
|
-
@staticmethod
|
433
|
-
def _word_chunks(text, num_words: int) -> Generator[str, None, None]:
|
434
|
-
"""Split a text into chunks of a given size.
|
435
|
-
|
436
|
-
:param text: The text to split.
|
437
|
-
:param num_words: The number of words in each chunk.
|
438
|
-
|
439
|
-
Example:
|
440
|
-
|
441
|
-
>>> list(Scenario._word_chunks("This is a test.", 2))
|
442
|
-
['This is', 'a test.']
|
443
|
-
"""
|
444
|
-
words = text.split()
|
445
|
-
for i in range(0, len(words), num_words):
|
446
|
-
chunk = " ".join(words[i : i + num_words])
|
447
|
-
yield chunk
|
383
|
+
return Scenario(DocxScenario(docx_path).get_scenario_dict())
|
448
384
|
|
449
385
|
def chunk(
|
450
386
|
self,
|
@@ -495,36 +431,11 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
495
431
|
...
|
496
432
|
ValueError: You must specify either num_words or num_lines, but not both.
|
497
433
|
"""
|
498
|
-
from edsl.scenarios.
|
434
|
+
from edsl.scenarios.DocumentChunker import DocumentChunker
|
499
435
|
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
if num_lines is not None:
|
504
|
-
chunks = list(self._line_chunks(self[field], num_lines))
|
505
|
-
|
506
|
-
if num_words is None and num_lines is None:
|
507
|
-
raise ValueError("You must specify either num_words or num_lines.")
|
508
|
-
|
509
|
-
if num_words is not None and num_lines is not None:
|
510
|
-
raise ValueError(
|
511
|
-
"You must specify either num_words or num_lines, but not both."
|
512
|
-
)
|
513
|
-
|
514
|
-
scenarios = []
|
515
|
-
for i, chunk in enumerate(chunks):
|
516
|
-
new_scenario = copy.deepcopy(self)
|
517
|
-
new_scenario[field] = chunk
|
518
|
-
new_scenario[field + "_chunk"] = i
|
519
|
-
if include_original:
|
520
|
-
if hash_original:
|
521
|
-
new_scenario[field + "_original"] = hashlib.md5(
|
522
|
-
self[field].encode()
|
523
|
-
).hexdigest()
|
524
|
-
else:
|
525
|
-
new_scenario[field + "_original"] = self[field]
|
526
|
-
scenarios.append(new_scenario)
|
527
|
-
return ScenarioList(scenarios)
|
436
|
+
return DocumentChunker(self).chunk(
|
437
|
+
field, num_words, num_lines, include_original, hash_original
|
438
|
+
)
|
528
439
|
|
529
440
|
@classmethod
|
530
441
|
@remove_edsl_version
|
@@ -547,44 +458,30 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
|
|
547
458
|
return cls(d)
|
548
459
|
|
549
460
|
def _table(self) -> tuple[dict, list]:
|
550
|
-
"""Prepare generic table data.
|
461
|
+
"""Prepare generic table data.
|
462
|
+
>>> s = Scenario({"food": "wood chips"})
|
463
|
+
>>> s._table()
|
464
|
+
([{'Attribute': 'data', 'Value': "{'food': 'wood chips'}"}, {'Attribute': 'name', 'Value': 'None'}], ['Attribute', 'Value'])
|
465
|
+
"""
|
551
466
|
table_data = []
|
552
467
|
for attr_name, attr_value in self.__dict__.items():
|
553
468
|
table_data.append({"Attribute": attr_name, "Value": repr(attr_value)})
|
554
469
|
column_names = ["Attribute", "Value"]
|
555
470
|
return table_data, column_names
|
556
471
|
|
557
|
-
def rich_print(self) -> "Table":
|
558
|
-
"""Display an object as a rich table."""
|
559
|
-
from rich.table import Table
|
560
|
-
|
561
|
-
table_data, column_names = self._table()
|
562
|
-
table = Table(title=f"{self.__class__.__name__} Attributes")
|
563
|
-
for column in column_names:
|
564
|
-
table.add_column(column, style="bold")
|
565
|
-
|
566
|
-
for row in table_data:
|
567
|
-
row_data = [row[column] for column in column_names]
|
568
|
-
table.add_row(*row_data)
|
569
|
-
|
570
|
-
return table
|
571
|
-
|
572
472
|
@classmethod
|
573
|
-
def example(cls, randomize: bool = False
|
473
|
+
def example(cls, randomize: bool = False) -> Scenario:
|
574
474
|
"""
|
575
475
|
Returns an example Scenario instance.
|
576
476
|
|
577
477
|
:param randomize: If True, adds a random string to the value of the example key.
|
578
478
|
"""
|
579
|
-
if not
|
580
|
-
|
581
|
-
|
582
|
-
{
|
583
|
-
|
584
|
-
|
585
|
-
)
|
586
|
-
else:
|
587
|
-
return cls.from_image(cls.example_image())
|
479
|
+
addition = "" if not randomize else str(uuid4())
|
480
|
+
return cls(
|
481
|
+
{
|
482
|
+
"persona": f"A reseacher studying whether LLMs can be used to generate surveys.{addition}",
|
483
|
+
}
|
484
|
+
)
|
588
485
|
|
589
486
|
def code(self) -> List[str]:
|
590
487
|
"""Return the code for the scenario."""
|
@@ -1,7 +1,4 @@
|
|
1
|
-
import requests
|
2
1
|
from typing import Optional
|
3
|
-
from requests.adapters import HTTPAdapter
|
4
|
-
from requests.packages.urllib3.util.retry import Retry
|
5
2
|
|
6
3
|
|
7
4
|
class ScenarioHtmlMixin:
|
@@ -22,6 +19,10 @@ class ScenarioHtmlMixin:
|
|
22
19
|
|
23
20
|
def fetch_html(url):
|
24
21
|
# Define the user-agent to mimic a browser
|
22
|
+
import requests
|
23
|
+
from requests.adapters import HTTPAdapter
|
24
|
+
from requests.packages.urllib3.util.retry import Retry
|
25
|
+
|
25
26
|
headers = {
|
26
27
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
27
28
|
}
|
edsl/scenarios/ScenarioJoin.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from typing import Union, TYPE_CHECKING
|
3
3
|
|
4
|
-
|
5
|
-
from edsl.scenarios.ScenarioList import ScenarioList
|
6
|
-
from edsl.scenarios.Scenario import Scenario
|
4
|
+
if TYPE_CHECKING:
|
5
|
+
from edsl.scenarios.ScenarioList import ScenarioList
|
6
|
+
from edsl.scenarios.Scenario import Scenario
|
7
7
|
|
8
8
|
|
9
9
|
class ScenarioJoin:
|
@@ -23,7 +23,7 @@ class ScenarioJoin:
|
|
23
23
|
self.left = left
|
24
24
|
self.right = right
|
25
25
|
|
26
|
-
def left_join(self, by: Union[str, list[str]]) -> ScenarioList:
|
26
|
+
def left_join(self, by: Union[str, list[str]]) -> "ScenarioList":
|
27
27
|
"""Perform a left join between the two ScenarioLists.
|
28
28
|
|
29
29
|
Args:
|
@@ -35,6 +35,8 @@ class ScenarioJoin:
|
|
35
35
|
Raises:
|
36
36
|
ValueError: If by is empty or if any join keys don't exist in both ScenarioLists
|
37
37
|
"""
|
38
|
+
from edsl.scenarios.ScenarioList import ScenarioList
|
39
|
+
|
38
40
|
self._validate_join_keys(by)
|
39
41
|
by_keys = [by] if isinstance(by, str) else by
|
40
42
|
|
@@ -86,6 +88,8 @@ class ScenarioJoin:
|
|
86
88
|
self, by_keys: list[str], other_dict: dict, all_keys: set
|
87
89
|
) -> list[Scenario]:
|
88
90
|
"""Create the joined scenarios."""
|
91
|
+
from edsl.scenarios.Scenario import Scenario
|
92
|
+
|
89
93
|
new_scenarios = []
|
90
94
|
|
91
95
|
for scenario in self.left:
|
@@ -105,8 +109,8 @@ class ScenarioJoin:
|
|
105
109
|
def _handle_matching_scenario(
|
106
110
|
self,
|
107
111
|
new_scenario: dict,
|
108
|
-
left_scenario: Scenario,
|
109
|
-
right_scenario: Scenario,
|
112
|
+
left_scenario: "Scenario",
|
113
|
+
right_scenario: "Scenario",
|
110
114
|
by_keys: list[str],
|
111
115
|
) -> None:
|
112
116
|
"""Handle merging of matching scenarios and conflict warnings."""
|