edsl 0.1.33__py3-none-any.whl → 0.1.33.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/Base.py +3 -9
- edsl/__init__.py +3 -8
- edsl/__version__.py +1 -1
- edsl/agents/Agent.py +8 -40
- edsl/agents/AgentList.py +0 -43
- edsl/agents/Invigilator.py +219 -135
- edsl/agents/InvigilatorBase.py +59 -148
- edsl/agents/{PromptConstructor.py → PromptConstructionMixin.py} +89 -138
- edsl/agents/__init__.py +0 -1
- edsl/config.py +56 -47
- edsl/coop/coop.py +7 -50
- edsl/data/Cache.py +1 -35
- edsl/data_transfer_models.py +38 -73
- edsl/enums.py +0 -4
- edsl/exceptions/language_models.py +1 -25
- edsl/exceptions/questions.py +5 -62
- edsl/exceptions/results.py +0 -4
- edsl/inference_services/AnthropicService.py +11 -13
- edsl/inference_services/AwsBedrock.py +17 -19
- edsl/inference_services/AzureAI.py +20 -37
- edsl/inference_services/GoogleService.py +12 -16
- edsl/inference_services/GroqService.py +0 -2
- edsl/inference_services/InferenceServiceABC.py +3 -58
- edsl/inference_services/OpenAIService.py +54 -48
- edsl/inference_services/models_available_cache.py +6 -0
- edsl/inference_services/registry.py +0 -6
- edsl/jobs/Answers.py +12 -10
- edsl/jobs/Jobs.py +21 -36
- edsl/jobs/buckets/BucketCollection.py +15 -24
- edsl/jobs/buckets/TokenBucket.py +14 -93
- edsl/jobs/interviews/Interview.py +78 -366
- edsl/jobs/interviews/InterviewExceptionEntry.py +19 -85
- edsl/jobs/interviews/InterviewTaskBuildingMixin.py +286 -0
- edsl/jobs/interviews/{InterviewExceptionCollection.py → interview_exception_tracking.py} +68 -14
- edsl/jobs/interviews/retry_management.py +37 -0
- edsl/jobs/runners/JobsRunnerAsyncio.py +175 -146
- edsl/jobs/runners/JobsRunnerStatusMixin.py +333 -0
- edsl/jobs/tasks/QuestionTaskCreator.py +23 -30
- edsl/jobs/tasks/TaskHistory.py +213 -148
- edsl/language_models/LanguageModel.py +156 -261
- edsl/language_models/ModelList.py +2 -2
- edsl/language_models/RegisterLanguageModelsMeta.py +29 -14
- edsl/language_models/registry.py +6 -23
- edsl/language_models/repair.py +19 -0
- edsl/prompts/Prompt.py +2 -52
- edsl/questions/AnswerValidatorMixin.py +26 -23
- edsl/questions/QuestionBase.py +249 -329
- edsl/questions/QuestionBudget.py +41 -99
- edsl/questions/QuestionCheckBox.py +35 -227
- edsl/questions/QuestionExtract.py +27 -98
- edsl/questions/QuestionFreeText.py +29 -52
- edsl/questions/QuestionFunctional.py +0 -7
- edsl/questions/QuestionList.py +22 -141
- edsl/questions/QuestionMultipleChoice.py +65 -159
- edsl/questions/QuestionNumerical.py +46 -88
- edsl/questions/QuestionRank.py +24 -182
- edsl/questions/RegisterQuestionsMeta.py +12 -31
- edsl/questions/__init__.py +4 -3
- edsl/questions/derived/QuestionLikertFive.py +5 -10
- edsl/questions/derived/QuestionLinearScale.py +2 -15
- edsl/questions/derived/QuestionTopK.py +1 -10
- edsl/questions/derived/QuestionYesNo.py +3 -24
- edsl/questions/descriptors.py +7 -43
- edsl/questions/question_registry.py +2 -6
- edsl/results/Dataset.py +0 -20
- edsl/results/DatasetExportMixin.py +48 -46
- edsl/results/Result.py +5 -32
- edsl/results/Results.py +46 -135
- edsl/results/ResultsDBMixin.py +3 -3
- edsl/scenarios/FileStore.py +10 -71
- edsl/scenarios/Scenario.py +25 -96
- edsl/scenarios/ScenarioImageMixin.py +2 -2
- edsl/scenarios/ScenarioList.py +39 -361
- edsl/scenarios/ScenarioListExportMixin.py +0 -9
- edsl/scenarios/ScenarioListPdfMixin.py +4 -150
- edsl/study/SnapShot.py +1 -8
- edsl/study/Study.py +0 -32
- edsl/surveys/Rule.py +1 -10
- edsl/surveys/RuleCollection.py +5 -21
- edsl/surveys/Survey.py +310 -636
- edsl/surveys/SurveyExportMixin.py +9 -71
- edsl/surveys/SurveyFlowVisualizationMixin.py +1 -2
- edsl/surveys/SurveyQualtricsImport.py +4 -75
- edsl/utilities/gcp_bucket/simple_example.py +9 -0
- edsl/utilities/utilities.py +1 -9
- {edsl-0.1.33.dist-info → edsl-0.1.33.dev1.dist-info}/METADATA +2 -5
- edsl-0.1.33.dev1.dist-info/RECORD +209 -0
- edsl/TemplateLoader.py +0 -24
- edsl/auto/AutoStudy.py +0 -117
- edsl/auto/StageBase.py +0 -230
- edsl/auto/StageGenerateSurvey.py +0 -178
- edsl/auto/StageLabelQuestions.py +0 -125
- edsl/auto/StagePersona.py +0 -61
- edsl/auto/StagePersonaDimensionValueRanges.py +0 -88
- edsl/auto/StagePersonaDimensionValues.py +0 -74
- edsl/auto/StagePersonaDimensions.py +0 -69
- edsl/auto/StageQuestions.py +0 -73
- edsl/auto/SurveyCreatorPipeline.py +0 -21
- edsl/auto/utilities.py +0 -224
- edsl/coop/PriceFetcher.py +0 -58
- edsl/inference_services/MistralAIService.py +0 -120
- edsl/inference_services/TestService.py +0 -80
- edsl/inference_services/TogetherAIService.py +0 -170
- edsl/jobs/FailedQuestion.py +0 -78
- edsl/jobs/runners/JobsRunnerStatus.py +0 -331
- edsl/language_models/fake_openai_call.py +0 -15
- edsl/language_models/fake_openai_service.py +0 -61
- edsl/language_models/utilities.py +0 -61
- edsl/questions/QuestionBaseGenMixin.py +0 -133
- edsl/questions/QuestionBasePromptsMixin.py +0 -266
- edsl/questions/Quick.py +0 -41
- edsl/questions/ResponseValidatorABC.py +0 -170
- edsl/questions/decorators.py +0 -21
- edsl/questions/prompt_templates/question_budget.jinja +0 -13
- edsl/questions/prompt_templates/question_checkbox.jinja +0 -32
- edsl/questions/prompt_templates/question_extract.jinja +0 -11
- edsl/questions/prompt_templates/question_free_text.jinja +0 -3
- edsl/questions/prompt_templates/question_linear_scale.jinja +0 -11
- edsl/questions/prompt_templates/question_list.jinja +0 -17
- edsl/questions/prompt_templates/question_multiple_choice.jinja +0 -33
- edsl/questions/prompt_templates/question_numerical.jinja +0 -37
- edsl/questions/templates/__init__.py +0 -0
- edsl/questions/templates/budget/__init__.py +0 -0
- edsl/questions/templates/budget/answering_instructions.jinja +0 -7
- edsl/questions/templates/budget/question_presentation.jinja +0 -7
- edsl/questions/templates/checkbox/__init__.py +0 -0
- edsl/questions/templates/checkbox/answering_instructions.jinja +0 -10
- edsl/questions/templates/checkbox/question_presentation.jinja +0 -22
- edsl/questions/templates/extract/__init__.py +0 -0
- edsl/questions/templates/extract/answering_instructions.jinja +0 -7
- edsl/questions/templates/extract/question_presentation.jinja +0 -1
- edsl/questions/templates/free_text/__init__.py +0 -0
- edsl/questions/templates/free_text/answering_instructions.jinja +0 -0
- edsl/questions/templates/free_text/question_presentation.jinja +0 -1
- edsl/questions/templates/likert_five/__init__.py +0 -0
- edsl/questions/templates/likert_five/answering_instructions.jinja +0 -10
- edsl/questions/templates/likert_five/question_presentation.jinja +0 -12
- edsl/questions/templates/linear_scale/__init__.py +0 -0
- edsl/questions/templates/linear_scale/answering_instructions.jinja +0 -5
- edsl/questions/templates/linear_scale/question_presentation.jinja +0 -5
- edsl/questions/templates/list/__init__.py +0 -0
- edsl/questions/templates/list/answering_instructions.jinja +0 -4
- edsl/questions/templates/list/question_presentation.jinja +0 -5
- edsl/questions/templates/multiple_choice/__init__.py +0 -0
- edsl/questions/templates/multiple_choice/answering_instructions.jinja +0 -9
- edsl/questions/templates/multiple_choice/html.jinja +0 -0
- edsl/questions/templates/multiple_choice/question_presentation.jinja +0 -12
- edsl/questions/templates/numerical/__init__.py +0 -0
- edsl/questions/templates/numerical/answering_instructions.jinja +0 -8
- edsl/questions/templates/numerical/question_presentation.jinja +0 -7
- edsl/questions/templates/rank/__init__.py +0 -0
- edsl/questions/templates/rank/answering_instructions.jinja +0 -11
- edsl/questions/templates/rank/question_presentation.jinja +0 -15
- edsl/questions/templates/top_k/__init__.py +0 -0
- edsl/questions/templates/top_k/answering_instructions.jinja +0 -8
- edsl/questions/templates/top_k/question_presentation.jinja +0 -22
- edsl/questions/templates/yes_no/__init__.py +0 -0
- edsl/questions/templates/yes_no/answering_instructions.jinja +0 -6
- edsl/questions/templates/yes_no/question_presentation.jinja +0 -12
- edsl/results/DatasetTree.py +0 -145
- edsl/results/Selector.py +0 -118
- edsl/results/tree_explore.py +0 -115
- edsl/surveys/instructions/ChangeInstruction.py +0 -47
- edsl/surveys/instructions/Instruction.py +0 -34
- edsl/surveys/instructions/InstructionCollection.py +0 -77
- edsl/surveys/instructions/__init__.py +0 -0
- edsl/templates/error_reporting/base.html +0 -24
- edsl/templates/error_reporting/exceptions_by_model.html +0 -35
- edsl/templates/error_reporting/exceptions_by_question_name.html +0 -17
- edsl/templates/error_reporting/exceptions_by_type.html +0 -17
- edsl/templates/error_reporting/interview_details.html +0 -116
- edsl/templates/error_reporting/interviews.html +0 -10
- edsl/templates/error_reporting/overview.html +0 -5
- edsl/templates/error_reporting/performance_plot.html +0 -2
- edsl/templates/error_reporting/report.css +0 -74
- edsl/templates/error_reporting/report.html +0 -118
- edsl/templates/error_reporting/report.js +0 -25
- edsl-0.1.33.dist-info/RECORD +0 -295
- {edsl-0.1.33.dist-info → edsl-0.1.33.dev1.dist-info}/LICENSE +0 -0
- {edsl-0.1.33.dist-info → edsl-0.1.33.dev1.dist-info}/WHEEL +0 -0
edsl/scenarios/ScenarioList.py
CHANGED
@@ -6,11 +6,6 @@ import csv
|
|
6
6
|
import random
|
7
7
|
from collections import UserList, Counter
|
8
8
|
from collections.abc import Iterable
|
9
|
-
import urllib.parse
|
10
|
-
import urllib.request
|
11
|
-
from io import StringIO
|
12
|
-
from collections import defaultdict
|
13
|
-
import inspect
|
14
9
|
|
15
10
|
from simpleeval import EvalWithCompoundTypes
|
16
11
|
|
@@ -20,9 +15,6 @@ from edsl.scenarios.Scenario import Scenario
|
|
20
15
|
from edsl.scenarios.ScenarioListPdfMixin import ScenarioListPdfMixin
|
21
16
|
from edsl.scenarios.ScenarioListExportMixin import ScenarioListExportMixin
|
22
17
|
|
23
|
-
from edsl.conjure.naming_utilities import sanitize_string
|
24
|
-
from edsl.utilities.utilities import is_valid_variable_name
|
25
|
-
|
26
18
|
|
27
19
|
class ScenarioListMixin(ScenarioListPdfMixin, ScenarioListExportMixin):
|
28
20
|
pass
|
@@ -31,180 +23,12 @@ class ScenarioListMixin(ScenarioListPdfMixin, ScenarioListExportMixin):
|
|
31
23
|
class ScenarioList(Base, UserList, ScenarioListMixin):
|
32
24
|
"""Class for creating a list of scenarios to be used in a survey."""
|
33
25
|
|
34
|
-
def __init__(self, data: Optional[list] = None
|
26
|
+
def __init__(self, data: Optional[list] = None):
|
35
27
|
"""Initialize the ScenarioList class."""
|
36
28
|
if data is not None:
|
37
29
|
super().__init__(data)
|
38
30
|
else:
|
39
31
|
super().__init__([])
|
40
|
-
self.codebook = codebook or {}
|
41
|
-
|
42
|
-
@property
|
43
|
-
def has_jinja_braces(self) -> bool:
|
44
|
-
"""Check if the ScenarioList has Jinja braces."""
|
45
|
-
return any([scenario.has_jinja_braces for scenario in self])
|
46
|
-
|
47
|
-
def convert_jinja_braces(self) -> ScenarioList:
|
48
|
-
"""Convert Jinja braces to Python braces."""
|
49
|
-
return ScenarioList([scenario.convert_jinja_braces() for scenario in self])
|
50
|
-
|
51
|
-
def give_valid_names(self) -> ScenarioList:
|
52
|
-
"""Give valid names to the scenario keys.
|
53
|
-
|
54
|
-
>>> s = ScenarioList([Scenario({'a': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
|
55
|
-
>>> s.give_valid_names()
|
56
|
-
ScenarioList([Scenario({'a': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
|
57
|
-
>>> s = ScenarioList([Scenario({'are you there John?': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
|
58
|
-
>>> s.give_valid_names()
|
59
|
-
ScenarioList([Scenario({'john': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
|
60
|
-
"""
|
61
|
-
codebook = {}
|
62
|
-
new_scenaerios = []
|
63
|
-
for scenario in self:
|
64
|
-
new_scenario = {}
|
65
|
-
for key in scenario:
|
66
|
-
if not is_valid_variable_name(key):
|
67
|
-
if key in codebook:
|
68
|
-
new_key = codebook[key]
|
69
|
-
else:
|
70
|
-
new_key = sanitize_string(key)
|
71
|
-
if not is_valid_variable_name(new_key):
|
72
|
-
new_key = f"var_{len(codebook)}"
|
73
|
-
codebook[key] = new_key
|
74
|
-
new_scenario[new_key] = scenario[key]
|
75
|
-
else:
|
76
|
-
new_scenario[key] = scenario[key]
|
77
|
-
new_scenaerios.append(Scenario(new_scenario))
|
78
|
-
return ScenarioList(new_scenaerios, codebook)
|
79
|
-
|
80
|
-
def unpivot(self, id_vars=None, value_vars=None):
|
81
|
-
"""
|
82
|
-
Unpivot the ScenarioList, allowing for id variables to be specified.
|
83
|
-
|
84
|
-
Parameters:
|
85
|
-
id_vars (list): Fields to use as identifier variables (kept in each entry)
|
86
|
-
value_vars (list): Fields to unpivot. If None, all fields not in id_vars will be used.
|
87
|
-
|
88
|
-
Example:
|
89
|
-
>>> s = ScenarioList([
|
90
|
-
... Scenario({'id': 1, 'year': 2020, 'a': 10, 'b': 20}),
|
91
|
-
... Scenario({'id': 2, 'year': 2021, 'a': 15, 'b': 25})
|
92
|
-
... ])
|
93
|
-
>>> s.unpivot(id_vars=['id', 'year'], value_vars=['a', 'b'])
|
94
|
-
ScenarioList([Scenario({'id': 1, 'year': 2020, 'variable': 'a', 'value': 10}), Scenario({'id': 1, 'year': 2020, 'variable': 'b', 'value': 20}), Scenario({'id': 2, 'year': 2021, 'variable': 'a', 'value': 15}), Scenario({'id': 2, 'year': 2021, 'variable': 'b', 'value': 25})])
|
95
|
-
"""
|
96
|
-
if id_vars is None:
|
97
|
-
id_vars = []
|
98
|
-
if value_vars is None:
|
99
|
-
value_vars = [field for field in self[0].keys() if field not in id_vars]
|
100
|
-
|
101
|
-
new_scenarios = []
|
102
|
-
for scenario in self:
|
103
|
-
for var in value_vars:
|
104
|
-
new_scenario = {id_var: scenario[id_var] for id_var in id_vars}
|
105
|
-
new_scenario["variable"] = var
|
106
|
-
new_scenario["value"] = scenario[var]
|
107
|
-
new_scenarios.append(Scenario(new_scenario))
|
108
|
-
|
109
|
-
return ScenarioList(new_scenarios)
|
110
|
-
|
111
|
-
def pivot(self, id_vars, var_name="variable", value_name="value"):
|
112
|
-
"""
|
113
|
-
Pivot the ScenarioList from long to wide format.
|
114
|
-
|
115
|
-
Parameters:
|
116
|
-
id_vars (list): Fields to use as identifier variables
|
117
|
-
var_name (str): Name of the variable column (default: 'variable')
|
118
|
-
value_name (str): Name of the value column (default: 'value')
|
119
|
-
|
120
|
-
Example:
|
121
|
-
>>> s = ScenarioList([
|
122
|
-
... Scenario({'id': 1, 'year': 2020, 'variable': 'a', 'value': 10}),
|
123
|
-
... Scenario({'id': 1, 'year': 2020, 'variable': 'b', 'value': 20}),
|
124
|
-
... Scenario({'id': 2, 'year': 2021, 'variable': 'a', 'value': 15}),
|
125
|
-
... Scenario({'id': 2, 'year': 2021, 'variable': 'b', 'value': 25})
|
126
|
-
... ])
|
127
|
-
>>> s.pivot(id_vars=['id', 'year'])
|
128
|
-
ScenarioList([Scenario({'id': 1, 'year': 2020, 'a': 10, 'b': 20}), Scenario({'id': 2, 'year': 2021, 'a': 15, 'b': 25})])
|
129
|
-
"""
|
130
|
-
pivoted_dict = {}
|
131
|
-
|
132
|
-
for scenario in self:
|
133
|
-
# Create a tuple of id values to use as a key
|
134
|
-
id_key = tuple(scenario[id_var] for id_var in id_vars)
|
135
|
-
|
136
|
-
# If this combination of id values hasn't been seen before, initialize it
|
137
|
-
if id_key not in pivoted_dict:
|
138
|
-
pivoted_dict[id_key] = {id_var: scenario[id_var] for id_var in id_vars}
|
139
|
-
|
140
|
-
# Add the variable-value pair to the dict
|
141
|
-
variable = scenario[var_name]
|
142
|
-
value = scenario[value_name]
|
143
|
-
pivoted_dict[id_key][variable] = value
|
144
|
-
|
145
|
-
# Convert the dict of dicts to a list of Scenarios
|
146
|
-
pivoted_scenarios = [
|
147
|
-
Scenario(dict(zip(id_vars, id_key), **values))
|
148
|
-
for id_key, values in pivoted_dict.items()
|
149
|
-
]
|
150
|
-
|
151
|
-
return ScenarioList(pivoted_scenarios)
|
152
|
-
|
153
|
-
def group_by(self, id_vars, variables, func):
|
154
|
-
"""
|
155
|
-
Group the ScenarioList by id_vars and apply a function to the specified variables.
|
156
|
-
|
157
|
-
Parameters:
|
158
|
-
id_vars (list): Fields to use as identifier variables for grouping
|
159
|
-
variables (list): Fields to pass to the aggregation function
|
160
|
-
func (callable): Function to apply to the grouped variables.
|
161
|
-
Should accept lists of values for each variable.
|
162
|
-
|
163
|
-
Returns:
|
164
|
-
ScenarioList: A new ScenarioList with the grouped and aggregated results
|
165
|
-
|
166
|
-
Example:
|
167
|
-
>>> def avg_sum(a, b):
|
168
|
-
... return {'avg_a': sum(a) / len(a), 'sum_b': sum(b)}
|
169
|
-
>>> s = ScenarioList([
|
170
|
-
... Scenario({'group': 'A', 'year': 2020, 'a': 10, 'b': 20}),
|
171
|
-
... Scenario({'group': 'A', 'year': 2021, 'a': 15, 'b': 25}),
|
172
|
-
... Scenario({'group': 'B', 'year': 2020, 'a': 12, 'b': 22}),
|
173
|
-
... Scenario({'group': 'B', 'year': 2021, 'a': 17, 'b': 27})
|
174
|
-
... ])
|
175
|
-
>>> s.group_by(id_vars=['group'], variables=['a', 'b'], func=avg_sum)
|
176
|
-
ScenarioList([Scenario({'group': 'A', 'avg_a': 12.5, 'sum_b': 45}), Scenario({'group': 'B', 'avg_a': 14.5, 'sum_b': 49})])
|
177
|
-
"""
|
178
|
-
# Check if the function is compatible with the specified variables
|
179
|
-
func_params = inspect.signature(func).parameters
|
180
|
-
if len(func_params) != len(variables):
|
181
|
-
raise ValueError(
|
182
|
-
f"Function {func.__name__} expects {len(func_params)} arguments, but {len(variables)} variables were provided"
|
183
|
-
)
|
184
|
-
|
185
|
-
# Group the scenarios
|
186
|
-
grouped = defaultdict(lambda: defaultdict(list))
|
187
|
-
for scenario in self:
|
188
|
-
key = tuple(scenario[id_var] for id_var in id_vars)
|
189
|
-
for var in variables:
|
190
|
-
grouped[key][var].append(scenario[var])
|
191
|
-
|
192
|
-
# Apply the function to each group
|
193
|
-
result = []
|
194
|
-
for key, group in grouped.items():
|
195
|
-
try:
|
196
|
-
aggregated = func(*[group[var] for var in variables])
|
197
|
-
except Exception as e:
|
198
|
-
raise ValueError(f"Error applying function to group {key}: {str(e)}")
|
199
|
-
|
200
|
-
if not isinstance(aggregated, dict):
|
201
|
-
raise ValueError(f"Function {func.__name__} must return a dictionary")
|
202
|
-
|
203
|
-
new_scenario = dict(zip(id_vars, key))
|
204
|
-
new_scenario.update(aggregated)
|
205
|
-
result.append(Scenario(new_scenario))
|
206
|
-
|
207
|
-
return ScenarioList(result)
|
208
32
|
|
209
33
|
@property
|
210
34
|
def parameters(self) -> set:
|
@@ -282,10 +106,6 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
282
106
|
for s in data["scenarios"]:
|
283
107
|
_ = s.pop("edsl_version")
|
284
108
|
_ = s.pop("edsl_class_name")
|
285
|
-
for scenario in data["scenarios"]:
|
286
|
-
for key, value in scenario.items():
|
287
|
-
if hasattr(value, "to_dict"):
|
288
|
-
data[key] = value.to_dict()
|
289
109
|
return data_to_html(data)
|
290
110
|
|
291
111
|
def tally(self, field) -> dict:
|
@@ -333,71 +153,6 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
333
153
|
new_scenarios.append(new_scenario)
|
334
154
|
return ScenarioList(new_scenarios)
|
335
155
|
|
336
|
-
def concatenate(self, fields: List[str], separator: str = ";") -> "ScenarioList":
|
337
|
-
"""Concatenate specified fields into a single field.
|
338
|
-
|
339
|
-
Args:
|
340
|
-
fields (List[str]): List of field names to concatenate.
|
341
|
-
separator (str, optional): Separator to use between field values. Defaults to ";".
|
342
|
-
|
343
|
-
Returns:
|
344
|
-
ScenarioList: A new ScenarioList with concatenated fields.
|
345
|
-
|
346
|
-
Example:
|
347
|
-
>>> s = ScenarioList([Scenario({'a': 1, 'b': 2, 'c': 3}), Scenario({'a': 4, 'b': 5, 'c': 6})])
|
348
|
-
>>> s.concatenate(['a', 'b', 'c'])
|
349
|
-
ScenarioList([Scenario({'concat_a_b_c': '1;2;3'}), Scenario({'concat_a_b_c': '4;5;6'})])
|
350
|
-
"""
|
351
|
-
new_scenarios = []
|
352
|
-
for scenario in self:
|
353
|
-
new_scenario = scenario.copy()
|
354
|
-
concat_values = []
|
355
|
-
for field in fields:
|
356
|
-
if field in new_scenario:
|
357
|
-
concat_values.append(str(new_scenario[field]))
|
358
|
-
del new_scenario[field]
|
359
|
-
|
360
|
-
new_field_name = f"concat_{'_'.join(fields)}"
|
361
|
-
new_scenario[new_field_name] = separator.join(concat_values)
|
362
|
-
new_scenarios.append(new_scenario)
|
363
|
-
|
364
|
-
return ScenarioList(new_scenarios)
|
365
|
-
|
366
|
-
def unpack_dict(
|
367
|
-
self, field: str, prefix: Optional[str] = None, drop_field: bool = False
|
368
|
-
) -> ScenarioList:
|
369
|
-
"""Unpack a dictionary field into separate fields.
|
370
|
-
|
371
|
-
Example:
|
372
|
-
|
373
|
-
>>> s = ScenarioList([Scenario({'a': 1, 'b': {'c': 2, 'd': 3}})])
|
374
|
-
>>> s.unpack_dict('b')
|
375
|
-
ScenarioList([Scenario({'a': 1, 'b': {'c': 2, 'd': 3}, 'c': 2, 'd': 3})])
|
376
|
-
"""
|
377
|
-
new_scenarios = []
|
378
|
-
for scenario in self:
|
379
|
-
new_scenario = scenario.copy()
|
380
|
-
for key, value in scenario[field].items():
|
381
|
-
if prefix:
|
382
|
-
new_scenario[prefix + key] = value
|
383
|
-
else:
|
384
|
-
new_scenario[key] = value
|
385
|
-
if drop_field:
|
386
|
-
new_scenario.pop(field)
|
387
|
-
new_scenarios.append(new_scenario)
|
388
|
-
return ScenarioList(new_scenarios)
|
389
|
-
|
390
|
-
def transform(
|
391
|
-
self, field: str, func: Callable, new_name: Optional[str] = None
|
392
|
-
) -> ScenarioList:
|
393
|
-
"""Transform a field using a function."""
|
394
|
-
new_scenarios = []
|
395
|
-
for scenario in self:
|
396
|
-
new_scenario = scenario.copy()
|
397
|
-
new_scenario[new_name or field] = func(scenario[field])
|
398
|
-
new_scenarios.append(new_scenario)
|
399
|
-
return ScenarioList(new_scenarios)
|
400
|
-
|
401
156
|
def mutate(
|
402
157
|
self, new_var_string: str, functions_dict: Optional[dict[str, Callable]] = None
|
403
158
|
) -> ScenarioList:
|
@@ -555,19 +310,6 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
555
310
|
data = [{key: [scenario[key] for scenario in self.data]} for key in keys]
|
556
311
|
return Dataset(data)
|
557
312
|
|
558
|
-
def split(
|
559
|
-
self, field: str, split_on: str, index: int, new_name: Optional[str] = None
|
560
|
-
) -> ScenarioList:
|
561
|
-
"""Split a scenario fiel in multiple fields."""
|
562
|
-
if new_name is None:
|
563
|
-
new_name = field + "_split_" + str(index)
|
564
|
-
new_scenarios = []
|
565
|
-
for scenario in self:
|
566
|
-
new_scenario = scenario.copy()
|
567
|
-
new_scenario[new_name] = scenario[field].split(split_on)[index]
|
568
|
-
new_scenarios.append(new_scenario)
|
569
|
-
return ScenarioList(new_scenarios)
|
570
|
-
|
571
313
|
def add_list(self, name, values) -> ScenarioList:
|
572
314
|
"""Add a list of values to a ScenarioList.
|
573
315
|
|
@@ -650,6 +392,37 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
650
392
|
|
651
393
|
return ScenarioList([Scenario(entry) for entry in processed_lines])
|
652
394
|
|
395
|
+
@classmethod
|
396
|
+
def from_docx(cls, docx_file_path: str):
|
397
|
+
from docx import Document
|
398
|
+
|
399
|
+
doc = Document(docx_file_path)
|
400
|
+
lines = []
|
401
|
+
|
402
|
+
# Extract text from paragraphs, treating each paragraph as a line
|
403
|
+
for para in doc.paragraphs:
|
404
|
+
lines.extend(para.text.splitlines())
|
405
|
+
|
406
|
+
processed_lines = []
|
407
|
+
non_blank_lines = [
|
408
|
+
(i, line.strip()) for i, line in enumerate(lines) if line.strip()
|
409
|
+
]
|
410
|
+
|
411
|
+
for index, (line_no, text) in enumerate(non_blank_lines):
|
412
|
+
entry = {
|
413
|
+
"line_no": line_no + 1, # Using 1-based index for line numbers
|
414
|
+
"text": text,
|
415
|
+
"line_before": non_blank_lines[index - 1][1] if index > 0 else None,
|
416
|
+
"line_after": (
|
417
|
+
non_blank_lines[index + 1][1]
|
418
|
+
if index < len(non_blank_lines) - 1
|
419
|
+
else None
|
420
|
+
),
|
421
|
+
}
|
422
|
+
processed_lines.append(entry)
|
423
|
+
|
424
|
+
return ScenarioList([Scenario(entry) for entry in processed_lines])
|
425
|
+
|
653
426
|
@classmethod
|
654
427
|
def from_google_doc(cls, url: str) -> ScenarioList:
|
655
428
|
"""Create a ScenarioList from a Google Doc.
|
@@ -700,62 +473,6 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
700
473
|
"""
|
701
474
|
return cls([Scenario(row) for row in df.to_dict(orient="records")])
|
702
475
|
|
703
|
-
@classmethod
|
704
|
-
def from_wikipedia(cls, url: str, table_index: int = 0):
|
705
|
-
"""
|
706
|
-
Extracts a table from a Wikipedia page.
|
707
|
-
|
708
|
-
Parameters:
|
709
|
-
url (str): The URL of the Wikipedia page.
|
710
|
-
table_index (int): The index of the table to extract (default is 0).
|
711
|
-
|
712
|
-
Returns:
|
713
|
-
pd.DataFrame: A DataFrame containing the extracted table.
|
714
|
-
# # Example usage
|
715
|
-
# url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
|
716
|
-
# df = from_wikipedia(url, 0)
|
717
|
-
|
718
|
-
# if not df.empty:
|
719
|
-
# print(df.head())
|
720
|
-
# else:
|
721
|
-
# print("Failed to extract table.")
|
722
|
-
|
723
|
-
|
724
|
-
"""
|
725
|
-
import pandas as pd
|
726
|
-
import requests
|
727
|
-
from requests.exceptions import RequestException
|
728
|
-
|
729
|
-
try:
|
730
|
-
# Check if the URL is reachable
|
731
|
-
response = requests.get(url)
|
732
|
-
response.raise_for_status() # Raises HTTPError for bad responses
|
733
|
-
|
734
|
-
# Extract tables from the Wikipedia page
|
735
|
-
tables = pd.read_html(url)
|
736
|
-
|
737
|
-
# Ensure the requested table index is within the range of available tables
|
738
|
-
if table_index >= len(tables) or table_index < 0:
|
739
|
-
raise IndexError(
|
740
|
-
f"Table index {table_index} is out of range. This page has {len(tables)} table(s)."
|
741
|
-
)
|
742
|
-
|
743
|
-
# Return the requested table as a DataFrame
|
744
|
-
# return tables[table_index]
|
745
|
-
return cls.from_pandas(tables[table_index])
|
746
|
-
|
747
|
-
except RequestException as e:
|
748
|
-
print(f"Error fetching the URL: {e}")
|
749
|
-
except ValueError as e:
|
750
|
-
print(f"Error parsing tables: {e}")
|
751
|
-
except IndexError as e:
|
752
|
-
print(e)
|
753
|
-
except Exception as e:
|
754
|
-
print(f"An unexpected error occurred: {e}")
|
755
|
-
|
756
|
-
# Return an empty DataFrame in case of an error
|
757
|
-
# return cls.from_pandas(pd.DataFrame())
|
758
|
-
|
759
476
|
def to_key_value(self, field: str, value=None) -> Union[dict, set]:
|
760
477
|
"""Return the set of values in the field.
|
761
478
|
|
@@ -877,15 +594,8 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
877
594
|
return cls.from_excel(temp_filename, sheet_name=sheet_name)
|
878
595
|
|
879
596
|
@classmethod
|
880
|
-
def from_csv(cls,
|
881
|
-
"""Create a ScenarioList from a CSV file
|
882
|
-
|
883
|
-
Args:
|
884
|
-
source: A string representing either a local file path or a URL to a CSV file,
|
885
|
-
or a urllib.parse.ParseResult object for a URL.
|
886
|
-
|
887
|
-
Returns:
|
888
|
-
ScenarioList: A ScenarioList object containing the data from the CSV.
|
597
|
+
def from_csv(cls, filename: str) -> ScenarioList:
|
598
|
+
"""Create a ScenarioList from a CSV file.
|
889
599
|
|
890
600
|
Example:
|
891
601
|
|
@@ -901,37 +611,15 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
901
611
|
'Alice'
|
902
612
|
>>> scenario_list[1]['age']
|
903
613
|
'25'
|
904
|
-
|
905
|
-
>>> url = "https://example.com/data.csv"
|
906
|
-
>>> ## scenario_list_from_url = ScenarioList.from_csv(url)
|
907
614
|
"""
|
908
615
|
from edsl.scenarios.Scenario import Scenario
|
909
616
|
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
return all([result.scheme, result.netloc])
|
914
|
-
except ValueError:
|
915
|
-
return False
|
916
|
-
|
917
|
-
if isinstance(source, str) and is_url(source):
|
918
|
-
with urllib.request.urlopen(source) as response:
|
919
|
-
csv_content = response.read().decode("utf-8")
|
920
|
-
csv_file = StringIO(csv_content)
|
921
|
-
elif isinstance(source, urllib.parse.ParseResult):
|
922
|
-
with urllib.request.urlopen(source.geturl()) as response:
|
923
|
-
csv_content = response.read().decode("utf-8")
|
924
|
-
csv_file = StringIO(csv_content)
|
925
|
-
else:
|
926
|
-
csv_file = open(source, "r")
|
927
|
-
|
928
|
-
try:
|
929
|
-
reader = csv.reader(csv_file)
|
617
|
+
observations = []
|
618
|
+
with open(filename, "r") as f:
|
619
|
+
reader = csv.reader(f)
|
930
620
|
header = next(reader)
|
931
|
-
|
932
|
-
|
933
|
-
csv_file.close()
|
934
|
-
|
621
|
+
for row in reader:
|
622
|
+
observations.append(Scenario(dict(zip(header, row))))
|
935
623
|
return cls(observations)
|
936
624
|
|
937
625
|
def _to_dict(self, sort=False) -> dict:
|
@@ -975,16 +663,6 @@ class ScenarioList(Base, UserList, ScenarioListMixin):
|
|
975
663
|
|
976
664
|
return cls([Scenario.from_dict(s) for s in data["scenarios"]])
|
977
665
|
|
978
|
-
@classmethod
|
979
|
-
def from_nested_dict(cls, data: dict) -> ScenarioList:
|
980
|
-
"""Create a `ScenarioList` from a nested dictionary."""
|
981
|
-
from edsl.scenarios.Scenario import Scenario
|
982
|
-
|
983
|
-
s = ScenarioList()
|
984
|
-
for key, value in data.items():
|
985
|
-
s.add_list(key, value)
|
986
|
-
return s
|
987
|
-
|
988
666
|
def code(self) -> str:
|
989
667
|
## TODO: Refactor to only use the questions actually in the survey
|
990
668
|
"""Create the Python code representation of a survey."""
|
@@ -41,12 +41,3 @@ class ScenarioListExportMixin(DatasetExportMixin):
|
|
41
41
|
def __init_subclass__(cls, **kwargs):
|
42
42
|
super().__init_subclass__(**kwargs)
|
43
43
|
decorate_methods_from_mixin(cls, DatasetExportMixin)
|
44
|
-
|
45
|
-
def to_docx(self, filename: str):
|
46
|
-
"""Export the ScenarioList to a .docx file."""
|
47
|
-
dataset = self.to_dataset()
|
48
|
-
from edsl.results.DatasetTree import Tree
|
49
|
-
|
50
|
-
tree = Tree(dataset)
|
51
|
-
tree.construct_tree()
|
52
|
-
tree.to_docx(filename)
|
@@ -1,161 +1,15 @@
|
|
1
1
|
import fitz # PyMuPDF
|
2
2
|
import os
|
3
|
-
import copy
|
4
3
|
import subprocess
|
5
|
-
import requests
|
6
|
-
import tempfile
|
7
|
-
import os
|
8
|
-
|
9
|
-
# import urllib.parse as urlparse
|
10
|
-
from urllib.parse import urlparse
|
11
4
|
|
12
5
|
# from edsl import Scenario
|
13
6
|
|
14
|
-
import requests
|
15
|
-
import re
|
16
|
-
import tempfile
|
17
|
-
import os
|
18
|
-
import atexit
|
19
|
-
from urllib.parse import urlparse, parse_qs
|
20
|
-
|
21
|
-
|
22
|
-
class GoogleDriveDownloader:
|
23
|
-
_temp_dir = None
|
24
|
-
_temp_file_path = None
|
25
|
-
|
26
|
-
@classmethod
|
27
|
-
def fetch_from_drive(cls, url, filename=None):
|
28
|
-
# Extract file ID from the URL
|
29
|
-
file_id = cls._extract_file_id(url)
|
30
|
-
if not file_id:
|
31
|
-
raise ValueError("Invalid Google Drive URL")
|
32
|
-
|
33
|
-
# Construct the download URL
|
34
|
-
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
35
|
-
|
36
|
-
# Send a GET request to the URL
|
37
|
-
session = requests.Session()
|
38
|
-
response = session.get(download_url, stream=True)
|
39
|
-
response.raise_for_status()
|
40
|
-
|
41
|
-
# Check for large file download prompt
|
42
|
-
for key, value in response.cookies.items():
|
43
|
-
if key.startswith("download_warning"):
|
44
|
-
params = {"id": file_id, "confirm": value}
|
45
|
-
response = session.get(download_url, params=params, stream=True)
|
46
|
-
break
|
47
|
-
|
48
|
-
# Create a temporary file to save the download
|
49
|
-
if not filename:
|
50
|
-
filename = "downloaded_file"
|
51
|
-
|
52
|
-
if cls._temp_dir is None:
|
53
|
-
cls._temp_dir = tempfile.TemporaryDirectory()
|
54
|
-
atexit.register(cls._cleanup)
|
55
|
-
|
56
|
-
cls._temp_file_path = os.path.join(cls._temp_dir.name, filename)
|
57
|
-
|
58
|
-
# Write the content to the temporary file
|
59
|
-
with open(cls._temp_file_path, "wb") as f:
|
60
|
-
for chunk in response.iter_content(32768):
|
61
|
-
if chunk:
|
62
|
-
f.write(chunk)
|
63
|
-
|
64
|
-
print(f"File saved to: {cls._temp_file_path}")
|
65
|
-
|
66
|
-
return cls._temp_file_path
|
67
|
-
|
68
|
-
@staticmethod
|
69
|
-
def _extract_file_id(url):
|
70
|
-
# Try to extract file ID from '/file/d/' format
|
71
|
-
file_id_match = re.search(r"/d/([a-zA-Z0-9-_]+)", url)
|
72
|
-
if file_id_match:
|
73
|
-
return file_id_match.group(1)
|
74
|
-
|
75
|
-
# If not found, try to extract from 'open?id=' format
|
76
|
-
parsed_url = urlparse(url)
|
77
|
-
query_params = parse_qs(parsed_url.query)
|
78
|
-
if "id" in query_params:
|
79
|
-
return query_params["id"][0]
|
80
|
-
|
81
|
-
return None
|
82
|
-
|
83
|
-
@classmethod
|
84
|
-
def _cleanup(cls):
|
85
|
-
if cls._temp_dir:
|
86
|
-
cls._temp_dir.cleanup()
|
87
|
-
|
88
|
-
@classmethod
|
89
|
-
def get_temp_file_path(cls):
|
90
|
-
return cls._temp_file_path
|
91
|
-
|
92
|
-
|
93
|
-
def fetch_and_save_pdf(url, filename):
|
94
|
-
# Send a GET request to the URL
|
95
|
-
response = requests.get(url)
|
96
|
-
|
97
|
-
# Check if the request was successful
|
98
|
-
response.raise_for_status()
|
99
|
-
|
100
|
-
# Create a temporary directory
|
101
|
-
with tempfile.TemporaryDirectory() as temp_dir:
|
102
|
-
# Construct the full path for the file
|
103
|
-
temp_file_path = os.path.join(temp_dir, filename)
|
104
|
-
|
105
|
-
# Write the content to the temporary file
|
106
|
-
with open(temp_file_path, "wb") as file:
|
107
|
-
file.write(response.content)
|
108
|
-
|
109
|
-
print(f"PDF saved to: {temp_file_path}")
|
110
|
-
|
111
|
-
# Here you can perform operations with the file
|
112
|
-
# The file will be automatically deleted when you exit this block
|
113
|
-
|
114
|
-
return temp_file_path
|
115
|
-
|
116
|
-
|
117
|
-
# Example usage:
|
118
|
-
# url = "https://example.com/sample.pdf"
|
119
|
-
# fetch_and_save_pdf(url, "sample.pdf")
|
120
|
-
|
121
7
|
|
122
8
|
class ScenarioListPdfMixin:
|
123
9
|
@classmethod
|
124
|
-
def from_pdf(cls,
|
125
|
-
|
126
|
-
|
127
|
-
# Check if it's a Google Drive URL
|
128
|
-
if "drive.google.com" in filename_or_url:
|
129
|
-
temp_filename = GoogleDriveDownloader.fetch_from_drive(
|
130
|
-
filename_or_url, "temp_pdf.pdf"
|
131
|
-
)
|
132
|
-
else:
|
133
|
-
# For other URLs, use the previous fetch_and_save_pdf function
|
134
|
-
temp_filename = fetch_and_save_pdf(filename_or_url, "temp_pdf.pdf")
|
135
|
-
|
136
|
-
scenarios = list(cls.extract_text_from_pdf(temp_filename))
|
137
|
-
else:
|
138
|
-
# If it's not a URL, assume it's a local file path
|
139
|
-
scenarios = list(cls.extract_text_from_pdf(filename_or_url))
|
140
|
-
if not collapse_pages:
|
141
|
-
return cls(scenarios)
|
142
|
-
else:
|
143
|
-
txt = ""
|
144
|
-
for scenario in scenarios:
|
145
|
-
txt += scenario["text"]
|
146
|
-
from edsl.scenarios import Scenario
|
147
|
-
|
148
|
-
base_scenario = copy.copy(scenarios[0])
|
149
|
-
base_scenario["text"] = txt
|
150
|
-
return base_scenario
|
151
|
-
|
152
|
-
@staticmethod
|
153
|
-
def is_url(string):
|
154
|
-
try:
|
155
|
-
result = urlparse(string)
|
156
|
-
return all([result.scheme, result.netloc])
|
157
|
-
except ValueError:
|
158
|
-
return False
|
10
|
+
def from_pdf(cls, filename):
|
11
|
+
scenarios = list(cls.extract_text_from_pdf(filename))
|
12
|
+
return cls(scenarios)
|
159
13
|
|
160
14
|
@classmethod
|
161
15
|
def _from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
|
@@ -184,7 +38,7 @@ class ScenarioListPdfMixin:
|
|
184
38
|
scenario = Scenario._from_filepath_image(image_path)
|
185
39
|
scenarios.append(scenario)
|
186
40
|
|
187
|
-
|
41
|
+
print(f"Saved {len(images)} pages as images in {output_folder}")
|
188
42
|
return cls(scenarios)
|
189
43
|
|
190
44
|
@staticmethod
|
edsl/study/SnapShot.py
CHANGED
@@ -57,17 +57,10 @@ class SnapShot:
|
|
57
57
|
from edsl.Base import Base
|
58
58
|
from edsl.study.Study import Study
|
59
59
|
|
60
|
-
def is_edsl_object(obj):
|
61
|
-
package_name = "edsl"
|
62
|
-
cls = obj.__class__
|
63
|
-
module_name = cls.__module__
|
64
|
-
return module_name.startswith(package_name)
|
65
|
-
|
66
60
|
for name, value in namespace.items():
|
67
61
|
# TODO check this code logic (if there are other objects with to_dict method that are not from edsl)
|
68
62
|
if (
|
69
|
-
|
70
|
-
and hasattr(value, "to_dict")
|
63
|
+
hasattr(value, "to_dict")
|
71
64
|
and not inspect.isclass(value)
|
72
65
|
and value.__class__ not in [o.__class__ for o in self.exclude]
|
73
66
|
):
|