edsl 0.1.54__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +3 -2
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/invigilators/invigilators.py +9 -0
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +86 -6
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +45 -75
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +63 -16
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +117 -6
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/RECORD +99 -75
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
edsl/scenarios/scenario_list.py
CHANGED
@@ -34,7 +34,13 @@ import os
|
|
34
34
|
from io import StringIO
|
35
35
|
import inspect
|
36
36
|
from collections import UserList, defaultdict
|
37
|
-
from collections.abc import Iterable
|
37
|
+
from collections.abc import Iterable, MutableSequence
|
38
|
+
import json
|
39
|
+
import pickle
|
40
|
+
|
41
|
+
|
42
|
+
# Import for refactoring to Source classes
|
43
|
+
from edsl.scenarios.scenario_source import deprecated_classmethod, TuplesSource
|
38
44
|
|
39
45
|
from simpleeval import EvalWithCompoundTypes, NameNotDefined # type: ignore
|
40
46
|
from tabulate import tabulate_formats
|
@@ -53,9 +59,17 @@ if TYPE_CHECKING:
|
|
53
59
|
|
54
60
|
|
55
61
|
from ..base import Base
|
56
|
-
from ..utilities import
|
62
|
+
from ..utilities import (
|
63
|
+
remove_edsl_version,
|
64
|
+
sanitize_string,
|
65
|
+
is_valid_variable_name,
|
66
|
+
dict_hash,
|
67
|
+
memory_profile,
|
68
|
+
)
|
57
69
|
from ..dataset import ScenarioListOperationsMixin
|
58
70
|
|
71
|
+
from ..db_list.sqlite_list import SQLiteList
|
72
|
+
|
59
73
|
from .exceptions import ScenarioError
|
60
74
|
from .scenario import Scenario
|
61
75
|
from .scenario_list_pdf_tools import PdfTools
|
@@ -83,41 +97,39 @@ TableFormat: TypeAlias = Literal[
|
|
83
97
|
"tsv",
|
84
98
|
]
|
85
99
|
|
86
|
-
|
100
|
+
|
101
|
+
|
102
|
+
class ScenarioSQLiteList(SQLiteList):
|
103
|
+
"""SQLite-backed list specifically for storing Scenario objects."""
|
104
|
+
|
105
|
+
def serialize(self, obj):
|
106
|
+
"""Serialize a Scenario object or other data to bytes using pickle."""
|
107
|
+
return pickle.dumps(obj)
|
108
|
+
|
109
|
+
def deserialize(self, data):
|
110
|
+
"""Deserialize pickled bytes back to a Scenario object or other data."""
|
111
|
+
if isinstance(data, str):
|
112
|
+
return pickle.loads(data.encode())
|
113
|
+
return pickle.loads(data)
|
114
|
+
|
115
|
+
|
116
|
+
if use_sqlite := True:
|
117
|
+
data_class = ScenarioSQLiteList
|
118
|
+
else:
|
119
|
+
data_class = list
|
120
|
+
|
121
|
+
class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
|
87
122
|
"""
|
88
123
|
A collection of Scenario objects with advanced operations for manipulation and analysis.
|
89
|
-
|
90
|
-
ScenarioList
|
91
|
-
|
92
|
-
with EDSL's object model and from ScenarioListOperationsMixin
|
93
|
-
powerful data manipulation capabilities.
|
94
|
-
|
95
|
-
The class provides methods for filtering, sorting, joining, transforming, and
|
96
|
-
analyzing collections of Scenarios. It's designed to work seamlessly with other
|
97
|
-
EDSL components like Surveys, Jobs, and Questions.
|
98
|
-
|
124
|
+
|
125
|
+
ScenarioList provides specialized functionality for working with collections of
|
126
|
+
Scenario objects. It inherits from MutableSequence to provide standard list operations,
|
127
|
+
from Base to integrate with EDSL's object model, and from ScenarioListOperationsMixin
|
128
|
+
to provide powerful data manipulation capabilities.
|
129
|
+
|
99
130
|
Attributes:
|
100
|
-
data (list): The underlying list
|
131
|
+
data (list): The underlying list containing Scenario objects.
|
101
132
|
codebook (dict): Optional metadata describing the fields in the scenarios.
|
102
|
-
|
103
|
-
Examples:
|
104
|
-
Create a ScenarioList from Scenario objects:
|
105
|
-
>>> from edsl.scenarios import Scenario, ScenarioList
|
106
|
-
>>> s1 = Scenario({"product": "apple", "price": 1.99})
|
107
|
-
>>> s2 = Scenario({"product": "banana", "price": 0.99})
|
108
|
-
>>> sl = ScenarioList([s1, s2])
|
109
|
-
|
110
|
-
Filter scenarios based on a condition:
|
111
|
-
>>> cheap_fruits = sl.filter("price < 1.50")
|
112
|
-
>>> len(cheap_fruits)
|
113
|
-
1
|
114
|
-
>>> cheap_fruits[0]["product"]
|
115
|
-
'banana'
|
116
|
-
|
117
|
-
Add a new column based on existing data:
|
118
|
-
>>> sl_with_tax = sl.mutate("tax = price * 0.08")
|
119
|
-
>>> sl_with_tax[0]["tax"]
|
120
|
-
0.1592
|
121
133
|
"""
|
122
134
|
|
123
135
|
__documentation__ = (
|
@@ -125,43 +137,64 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
125
137
|
)
|
126
138
|
|
127
139
|
def __init__(
|
128
|
-
self,
|
140
|
+
self,
|
141
|
+
data: Optional[list] = None,
|
142
|
+
codebook: Optional[dict[str, str]] = None,
|
143
|
+
data_class: Optional[type] = data_class,
|
129
144
|
):
|
130
|
-
"""
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
"""
|
148
|
-
if data is not None:
|
149
|
-
super().__init__(data)
|
150
|
-
else:
|
151
|
-
super().__init__([])
|
145
|
+
"""Initialize a new ScenarioList with optional data and codebook."""
|
146
|
+
self._data_class = data_class
|
147
|
+
self.data = self._data_class([])
|
148
|
+
warned = False
|
149
|
+
for item in data or []:
|
150
|
+
try:
|
151
|
+
_ = json.dumps(item.to_dict())
|
152
|
+
except:
|
153
|
+
import warnings
|
154
|
+
if not warned:
|
155
|
+
warnings.warn(
|
156
|
+
f"One or more items in the data list are not JSON serializable. "
|
157
|
+
"This would prevent running a job that uses this ScenarioList."
|
158
|
+
"One solution is to use 'str(item)' to convert the item to a string before adding."
|
159
|
+
)
|
160
|
+
warned = True
|
161
|
+
self.data.append(item)
|
152
162
|
self.codebook = codebook or {}
|
153
163
|
|
164
|
+
# Required MutableSequence abstract methods
|
165
|
+
def __getitem__(self, index):
|
166
|
+
"""Get item at index."""
|
167
|
+
if isinstance(index, slice):
|
168
|
+
return self.__class__(list(self.data[index]), self.codebook.copy())
|
169
|
+
return self.data[index]
|
170
|
+
|
171
|
+
def __setitem__(self, index, value):
|
172
|
+
"""Set item at index."""
|
173
|
+
self.data[index] = value
|
174
|
+
|
175
|
+
def __delitem__(self, index):
|
176
|
+
"""Delete item at index."""
|
177
|
+
del self.data[index]
|
178
|
+
|
179
|
+
def __len__(self):
|
180
|
+
"""Return number of items."""
|
181
|
+
return len(self.data)
|
182
|
+
|
183
|
+
def insert(self, index, value):
|
184
|
+
"""Insert value at index."""
|
185
|
+
self.data.insert(index, value)
|
186
|
+
|
154
187
|
def unique(self) -> ScenarioList:
|
155
188
|
"""
|
156
189
|
Return a new ScenarioList containing only unique Scenario objects.
|
157
|
-
|
190
|
+
|
158
191
|
This method removes duplicate Scenario objects based on their hash values,
|
159
192
|
which are determined by their content. Two Scenarios with identical key-value
|
160
193
|
pairs will have the same hash and be considered duplicates.
|
161
|
-
|
194
|
+
|
162
195
|
Returns:
|
163
196
|
A new ScenarioList containing only unique Scenario objects.
|
164
|
-
|
197
|
+
|
165
198
|
Examples:
|
166
199
|
>>> from edsl.scenarios import Scenario, ScenarioList
|
167
200
|
>>> s1 = Scenario({"a": 1})
|
@@ -173,26 +206,36 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
173
206
|
2
|
174
207
|
>>> unique_sl
|
175
208
|
ScenarioList([Scenario({'a': 1}), Scenario({'a': 2})])
|
176
|
-
|
209
|
+
|
177
210
|
Notes:
|
178
211
|
- The order of scenarios in the result is not guaranteed due to the use of sets
|
179
212
|
- Uniqueness is determined by the Scenario's __hash__ method
|
180
213
|
- The original ScenarioList is not modified
|
214
|
+
- This implementation is memory efficient as it processes scenarios one at a time
|
181
215
|
"""
|
182
|
-
|
216
|
+
seen_hashes = set()
|
217
|
+
result = ScenarioList()
|
218
|
+
|
219
|
+
for scenario in self.data:
|
220
|
+
scenario_hash = hash(scenario)
|
221
|
+
if scenario_hash not in seen_hashes:
|
222
|
+
seen_hashes.add(scenario_hash)
|
223
|
+
result.append(scenario)
|
224
|
+
|
225
|
+
return result
|
183
226
|
|
184
227
|
@property
|
185
228
|
def has_jinja_braces(self) -> bool:
|
186
229
|
"""
|
187
230
|
Check if any Scenario in the list contains values with Jinja template braces.
|
188
|
-
|
231
|
+
|
189
232
|
This property checks all Scenarios in the list to determine if any contain
|
190
233
|
string values with Jinja template syntax ({{ and }}). This is important for
|
191
234
|
rendering templates and avoiding conflicts with other templating systems.
|
192
|
-
|
235
|
+
|
193
236
|
Returns:
|
194
237
|
True if any Scenario contains values with Jinja braces, False otherwise.
|
195
|
-
|
238
|
+
|
196
239
|
Examples:
|
197
240
|
>>> from edsl.scenarios import Scenario, ScenarioList
|
198
241
|
>>> s1 = Scenario({"text": "Plain text"})
|
@@ -204,20 +247,23 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
204
247
|
>>> sl2.has_jinja_braces
|
205
248
|
True
|
206
249
|
"""
|
207
|
-
|
250
|
+
for scenario in self:
|
251
|
+
if scenario.has_jinja_braces:
|
252
|
+
return True
|
253
|
+
return False
|
208
254
|
|
209
255
|
def _convert_jinja_braces(self) -> ScenarioList:
|
210
256
|
"""
|
211
257
|
Convert Jinja braces to alternative symbols in all Scenarios in the list.
|
212
|
-
|
258
|
+
|
213
259
|
This method creates a new ScenarioList where all Jinja template braces
|
214
260
|
({{ and }}) in string values are converted to alternative symbols (<< and >>).
|
215
261
|
This is useful when you need to prevent template processing or avoid conflicts
|
216
262
|
with other templating systems.
|
217
|
-
|
263
|
+
|
218
264
|
Returns:
|
219
265
|
A new ScenarioList with converted braces in all Scenarios.
|
220
|
-
|
266
|
+
|
221
267
|
Examples:
|
222
268
|
>>> from edsl.scenarios import Scenario, ScenarioList
|
223
269
|
>>> s = Scenario({"text": "Template with {{variable}}"})
|
@@ -225,13 +271,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
225
271
|
>>> converted = sl._convert_jinja_braces()
|
226
272
|
>>> converted[0]["text"]
|
227
273
|
'Template with <<variable>>'
|
228
|
-
|
274
|
+
|
229
275
|
Notes:
|
230
276
|
- The original ScenarioList is not modified
|
231
277
|
- This is primarily intended for internal use
|
232
278
|
- The default replacement symbols are << and >>
|
233
279
|
"""
|
234
|
-
|
280
|
+
converted_sl = ScenarioList()
|
281
|
+
for scenario in self:
|
282
|
+
converted_sl.append(scenario._convert_jinja_braces())
|
283
|
+
return converted_sl
|
235
284
|
|
236
285
|
def give_valid_names(self, existing_codebook: dict = None) -> ScenarioList:
|
237
286
|
"""Give valid names to the scenario keys, using an existing codebook if provided.
|
@@ -253,7 +302,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
253
302
|
ScenarioList([Scenario({'custom_name': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
|
254
303
|
"""
|
255
304
|
codebook = existing_codebook.copy() if existing_codebook else {}
|
256
|
-
|
305
|
+
|
306
|
+
new_scenarios = ScenarioList(data = [], codebook = codebook)
|
257
307
|
|
258
308
|
for scenario in self:
|
259
309
|
new_scenario = {}
|
@@ -274,7 +324,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
274
324
|
|
275
325
|
new_scenarios.append(Scenario(new_scenario))
|
276
326
|
|
277
|
-
return
|
327
|
+
return new_scenarios
|
278
328
|
|
279
329
|
def unpivot(
|
280
330
|
self,
|
@@ -301,7 +351,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
301
351
|
if value_vars is None:
|
302
352
|
value_vars = [field for field in self[0].keys() if field not in id_vars]
|
303
353
|
|
304
|
-
new_scenarios = []
|
354
|
+
new_scenarios = ScenarioList(data = [], codebook = {})
|
305
355
|
for scenario in self:
|
306
356
|
for var in value_vars:
|
307
357
|
new_scenario = {id_var: scenario[id_var] for id_var in id_vars}
|
@@ -309,35 +359,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
309
359
|
new_scenario["value"] = scenario[var]
|
310
360
|
new_scenarios.append(Scenario(new_scenario))
|
311
361
|
|
312
|
-
return
|
313
|
-
|
314
|
-
def sem_filter(self, language_predicate: str) -> ScenarioList:
|
315
|
-
"""Filter the ScenarioList based on a language predicate.
|
316
|
-
|
317
|
-
:param language_predicate: The language predicate to use.
|
318
|
-
|
319
|
-
Inspired by:
|
320
|
-
@misc{patel2024semanticoperators,
|
321
|
-
title={Semantic Operators: A Declarative Model for Rich, AI-based Analytics Over Text Data},
|
322
|
-
author={Liana Patel and Siddharth Jha and Parth Asawa and Melissa Pan and Carlos Guestrin and Matei Zaharia},
|
323
|
-
year={2024},
|
324
|
-
eprint={2407.11418},
|
325
|
-
archivePrefix={arXiv},
|
326
|
-
primaryClass={cs.DB},
|
327
|
-
url={https://arxiv.org/abs/2407.11418},
|
328
|
-
}
|
329
|
-
"""
|
330
|
-
from ..questions import QuestionYesNo
|
331
|
-
|
332
|
-
new_scenario_list = self.duplicate()
|
333
|
-
q = QuestionYesNo(
|
334
|
-
question_text=language_predicate, question_name="binary_outcome"
|
335
|
-
)
|
336
|
-
results = q.by(new_scenario_list).run(verbose=False)
|
337
|
-
new_scenario_list = new_scenario_list.add_list(
|
338
|
-
"criteria", results.select("binary_outcome").to_list()
|
339
|
-
)
|
340
|
-
return new_scenario_list.filter("criteria == 'Yes'").drop("criteria")
|
362
|
+
return new_scenarios
|
341
363
|
|
342
364
|
def pivot(
|
343
365
|
self,
|
@@ -378,14 +400,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
378
400
|
value = scenario[value_name]
|
379
401
|
pivoted_dict[id_key][variable] = value
|
380
402
|
|
381
|
-
|
382
|
-
|
383
|
-
Scenario(dict(zip(id_vars, id_key), **values))
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
return ScenarioList(pivoted_scenarios)
|
388
|
-
|
403
|
+
new_sl = ScenarioList(data = [], codebook = self.codebook)
|
404
|
+
for id_key, values in pivoted_dict.items():
|
405
|
+
new_sl.append(Scenario(dict(zip(id_vars, id_key), **values)))
|
406
|
+
return new_sl
|
407
|
+
|
389
408
|
def group_by(
|
390
409
|
self, id_vars: List[str], variables: List[str], func: Callable
|
391
410
|
) -> ScenarioList:
|
@@ -426,7 +445,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
426
445
|
grouped[key][var].append(scenario[var])
|
427
446
|
|
428
447
|
# Apply the function to each group
|
429
|
-
|
448
|
+
new_sl= ScenarioList(data = [], codebook = self.codebook)
|
430
449
|
for key, group in grouped.items():
|
431
450
|
try:
|
432
451
|
aggregated = func(*[group[var] for var in variables])
|
@@ -440,9 +459,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
440
459
|
|
441
460
|
new_scenario = dict(zip(id_vars, key))
|
442
461
|
new_scenario.update(aggregated)
|
443
|
-
|
462
|
+
new_sl.append(Scenario(new_scenario))
|
444
463
|
|
445
|
-
return
|
464
|
+
return new_sl
|
446
465
|
|
447
466
|
@property
|
448
467
|
def parameters(self) -> set:
|
@@ -457,22 +476,51 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
457
476
|
if len(self) == 0:
|
458
477
|
return set()
|
459
478
|
|
460
|
-
|
479
|
+
params = set()
|
480
|
+
for scenario in self:
|
481
|
+
params.update(scenario.keys())
|
482
|
+
return params
|
461
483
|
|
462
|
-
def
|
463
|
-
"""Return the hash of the ScenarioList.
|
484
|
+
def __original_hash__(self) -> int:
|
485
|
+
"""Return the original hash of the ScenarioList using the dictionary-based approach.
|
464
486
|
|
465
487
|
>>> s = ScenarioList.example()
|
466
|
-
>>>
|
488
|
+
>>> s.__original_hash__()
|
467
489
|
1262252885757976162
|
468
490
|
"""
|
469
491
|
return dict_hash(self.to_dict(sort=True, add_edsl_version=False))
|
470
492
|
|
493
|
+
def __hash__(self) -> int:
|
494
|
+
"""Return the hash of the ScenarioList using a memory-efficient streaming approach.
|
495
|
+
|
496
|
+
>>> s = ScenarioList.example()
|
497
|
+
>>> hash(s)
|
498
|
+
1219708685929871252
|
499
|
+
"""
|
500
|
+
# Start with a seed value
|
501
|
+
running_hash = 0
|
502
|
+
|
503
|
+
# Use a heap to maintain sorted order as we go
|
504
|
+
import heapq
|
505
|
+
heap = []
|
506
|
+
|
507
|
+
# Process each scenario's hash and add to heap
|
508
|
+
for scenario in self:
|
509
|
+
heapq.heappush(heap, hash(scenario))
|
510
|
+
|
511
|
+
# Combine hashes in sorted order
|
512
|
+
while heap:
|
513
|
+
h = heapq.heappop(heap)
|
514
|
+
# Use a large prime number to mix the bits
|
515
|
+
running_hash = (running_hash * 31) ^ h
|
516
|
+
|
517
|
+
return running_hash
|
518
|
+
|
471
519
|
def __eq__(self, other: Any) -> bool:
|
472
520
|
return hash(self) == hash(other)
|
473
521
|
|
474
522
|
def __repr__(self):
|
475
|
-
return f"ScenarioList({self.data})"
|
523
|
+
return f"ScenarioList({list(self.data)})"
|
476
524
|
|
477
525
|
def __mul__(self, other: ScenarioList) -> ScenarioList:
|
478
526
|
"""Takes the cross product of two ScenarioLists.
|
@@ -484,16 +532,18 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
484
532
|
"""
|
485
533
|
from itertools import product
|
486
534
|
from .scenario import Scenario
|
535
|
+
|
487
536
|
if isinstance(other, Scenario):
|
488
537
|
other = ScenarioList([other])
|
489
538
|
elif not isinstance(other, ScenarioList):
|
490
539
|
from .exceptions import TypeScenarioError
|
540
|
+
|
491
541
|
raise TypeScenarioError(f"Cannot multiply ScenarioList with {type(other)}")
|
492
542
|
|
493
|
-
new_sl = []
|
494
|
-
for s1, s2 in
|
543
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
544
|
+
for s1, s2 in product(self, other):
|
495
545
|
new_sl.append(s1 + s2)
|
496
|
-
return
|
546
|
+
return new_sl
|
497
547
|
|
498
548
|
def times(self, other: ScenarioList) -> ScenarioList:
|
499
549
|
"""Takes the cross product of two ScenarioLists.
|
@@ -505,6 +555,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
505
555
|
>>> s1.times(s2)
|
506
556
|
ScenarioList([Scenario({'a': 1, 'b': 1}), Scenario({'a': 1, 'b': 2}), Scenario({'a': 2, 'b': 1}), Scenario({'a': 2, 'b': 2})])
|
507
557
|
"""
|
558
|
+
import warnings
|
559
|
+
warnings.warn("times is deprecated, use * instead", DeprecationWarning)
|
508
560
|
return self.__mul__(other)
|
509
561
|
|
510
562
|
def shuffle(self, seed: Optional[str] = None) -> ScenarioList:
|
@@ -524,14 +576,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
524
576
|
"""Return a random sample from the ScenarioList
|
525
577
|
|
526
578
|
>>> s = ScenarioList.from_list("a", [1,2,3,4,5,6])
|
527
|
-
>>> s.sample(3, seed = "edsl")
|
579
|
+
>>> s.sample(3, seed = "edsl") # doctest: +SKIP
|
528
580
|
ScenarioList([Scenario({'a': 2}), Scenario({'a': 1}), Scenario({'a': 3})])
|
529
581
|
"""
|
530
582
|
if seed:
|
531
583
|
random.seed(seed)
|
532
584
|
|
533
585
|
sl = self.duplicate()
|
534
|
-
|
586
|
+
# Convert to list if necessary for random.sample
|
587
|
+
data_list = list(sl.data)
|
588
|
+
return ScenarioList(random.sample(data_list, n))
|
535
589
|
|
536
590
|
def expand(self, expand_field: str, number_field: bool = False) -> ScenarioList:
|
537
591
|
"""Expand the ScenarioList by a field.
|
@@ -560,15 +614,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
560
614
|
new_scenarios.append(new_scenario)
|
561
615
|
return ScenarioList(new_scenarios)
|
562
616
|
|
563
|
-
def _concatenate(
|
617
|
+
def _concatenate(
|
618
|
+
self,
|
619
|
+
fields: List[str],
|
620
|
+
output_type: str = "string",
|
621
|
+
separator: str = ";",
|
622
|
+
new_field_name: Optional[str] = None,
|
623
|
+
) -> ScenarioList:
|
564
624
|
"""Private method to handle concatenation logic for different output types.
|
565
|
-
|
625
|
+
|
566
626
|
:param fields: The fields to concatenate.
|
567
627
|
:param output_type: The type of output ("string", "list", or "set").
|
568
628
|
:param separator: The separator to use for string concatenation.
|
569
629
|
:param new_field_name: Optional custom name for the concatenated field.
|
570
630
|
If None, defaults to "concat_field1_field2_..."
|
571
|
-
|
631
|
+
|
572
632
|
Returns:
|
573
633
|
ScenarioList: A new ScenarioList with concatenated fields.
|
574
634
|
"""
|
@@ -577,7 +637,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
577
637
|
raise ScenarioError(
|
578
638
|
f"The 'fields' parameter must be a list of field names, not a string. Got '{fields}'."
|
579
639
|
)
|
580
|
-
|
640
|
+
|
581
641
|
new_scenarios = []
|
582
642
|
for scenario in self:
|
583
643
|
new_scenario = scenario.copy()
|
@@ -587,8 +647,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
587
647
|
values.append(new_scenario[field])
|
588
648
|
del new_scenario[field]
|
589
649
|
|
590
|
-
field_name =
|
591
|
-
|
650
|
+
field_name = (
|
651
|
+
new_field_name
|
652
|
+
if new_field_name is not None
|
653
|
+
else f"concat_{'_'.join(fields)}"
|
654
|
+
)
|
655
|
+
|
592
656
|
if output_type == "string":
|
593
657
|
# Convert all values to strings and join with separator
|
594
658
|
new_scenario[field_name] = separator.join(str(v) for v in values)
|
@@ -600,13 +664,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
600
664
|
new_scenario[field_name] = set(values)
|
601
665
|
else:
|
602
666
|
from .exceptions import ValueScenarioError
|
603
|
-
|
604
|
-
|
667
|
+
|
668
|
+
raise ValueScenarioError(
|
669
|
+
f"Invalid output_type: {output_type}. Must be 'string', 'list', or 'set'."
|
670
|
+
)
|
671
|
+
|
605
672
|
new_scenarios.append(new_scenario)
|
606
673
|
|
607
674
|
return ScenarioList(new_scenarios)
|
608
675
|
|
609
|
-
def concatenate(
|
676
|
+
def concatenate(
|
677
|
+
self,
|
678
|
+
fields: List[str],
|
679
|
+
separator: str = ";",
|
680
|
+
new_field_name: Optional[str] = None,
|
681
|
+
) -> ScenarioList:
|
610
682
|
"""Concatenate specified fields into a single string field.
|
611
683
|
|
612
684
|
:param fields: The fields to concatenate.
|
@@ -623,9 +695,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
623
695
|
>>> s.concatenate(['a', 'b', 'c'], new_field_name='combined')
|
624
696
|
ScenarioList([Scenario({'combined': '1;2;3'}), Scenario({'combined': '4;5;6'})])
|
625
697
|
"""
|
626
|
-
return self._concatenate(
|
698
|
+
return self._concatenate(
|
699
|
+
fields,
|
700
|
+
output_type="string",
|
701
|
+
separator=separator,
|
702
|
+
new_field_name=new_field_name,
|
703
|
+
)
|
627
704
|
|
628
|
-
def concatenate_to_list(
|
705
|
+
def concatenate_to_list(
|
706
|
+
self, fields: List[str], new_field_name: Optional[str] = None
|
707
|
+
) -> ScenarioList:
|
629
708
|
"""Concatenate specified fields into a single list field.
|
630
709
|
|
631
710
|
:param fields: The fields to concatenate.
|
@@ -641,9 +720,13 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
641
720
|
>>> s.concatenate_to_list(['a', 'b', 'c'], new_field_name='values')
|
642
721
|
ScenarioList([Scenario({'values': [1, 2, 3]}), Scenario({'values': [4, 5, 6]})])
|
643
722
|
"""
|
644
|
-
return self._concatenate(
|
723
|
+
return self._concatenate(
|
724
|
+
fields, output_type="list", new_field_name=new_field_name
|
725
|
+
)
|
645
726
|
|
646
|
-
def concatenate_to_set(
|
727
|
+
def concatenate_to_set(
|
728
|
+
self, fields: List[str], new_field_name: Optional[str] = None
|
729
|
+
) -> ScenarioList:
|
647
730
|
"""Concatenate specified fields into a single set field.
|
648
731
|
|
649
732
|
:param fields: The fields to concatenate.
|
@@ -659,7 +742,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
659
742
|
>>> s.concatenate_to_set(['a', 'b', 'c'], new_field_name='unique_values')
|
660
743
|
ScenarioList([Scenario({'unique_values': {1, 2, 3}}), Scenario({'unique_values': {4, 5, 6}})])
|
661
744
|
"""
|
662
|
-
return self._concatenate(
|
745
|
+
return self._concatenate(
|
746
|
+
fields, output_type="set", new_field_name=new_field_name
|
747
|
+
)
|
663
748
|
|
664
749
|
def unpack_dict(
|
665
750
|
self, field: str, prefix: Optional[str] = None, drop_field: bool = False
|
@@ -773,10 +858,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
773
858
|
def get_sort_key(scenario: Any) -> tuple:
|
774
859
|
return tuple(scenario[field] for field in fields)
|
775
860
|
|
776
|
-
return ScenarioList(sorted(self, key=get_sort_key, reverse=reverse))
|
861
|
+
return ScenarioList(sorted(self.data, key=get_sort_key, reverse=reverse))
|
777
862
|
|
778
863
|
def duplicate(self) -> ScenarioList:
|
779
|
-
"""Return a copy of the ScenarioList.
|
864
|
+
"""Return a copy of the ScenarioList using streaming to avoid loading everything into memory.
|
780
865
|
|
781
866
|
>>> sl = ScenarioList.example()
|
782
867
|
>>> sl_copy = sl.duplicate()
|
@@ -785,8 +870,30 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
785
870
|
>>> sl is sl_copy
|
786
871
|
False
|
787
872
|
"""
|
788
|
-
|
873
|
+
new_list = ScenarioList()
|
874
|
+
for scenario in self.data:
|
875
|
+
new_list.append(scenario.copy())
|
876
|
+
return new_list
|
789
877
|
|
878
|
+
def __iter__(self):
|
879
|
+
"""Iterate over scenarios using streaming."""
|
880
|
+
return iter(self.data)
|
881
|
+
|
882
|
+
def equals(self, other: Any) -> bool:
|
883
|
+
"""Memory-efficient comparison of two ScenarioLists."""
|
884
|
+
if not isinstance(other, ScenarioList):
|
885
|
+
return False
|
886
|
+
if len(self) != len(other):
|
887
|
+
return False
|
888
|
+
if self.codebook != other.codebook:
|
889
|
+
return False
|
890
|
+
return self.data == other.data
|
891
|
+
|
892
|
+
def __eq__(self, other: Any) -> bool:
|
893
|
+
"""Use memory-efficient comparison by default."""
|
894
|
+
return self.equals(other)
|
895
|
+
|
896
|
+
@memory_profile
|
790
897
|
def filter(self, expression: str) -> ScenarioList:
|
791
898
|
"""
|
792
899
|
Filter a list of scenarios based on an expression.
|
@@ -799,36 +906,62 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
799
906
|
>>> s.filter("b == 2")
|
800
907
|
ScenarioList([Scenario({'a': 1, 'b': 2})])
|
801
908
|
"""
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
909
|
+
# Get first item to check keys if available
|
910
|
+
try:
|
911
|
+
first_item = self[0] if len(self) > 0 else None
|
912
|
+
if first_item:
|
913
|
+
# Check for ragged keys by examining a sample of scenarios
|
914
|
+
# rather than iterating through all of them
|
915
|
+
sample_size = min(len(self), 100) # Check at most 100 scenarios
|
916
|
+
base_keys = set(first_item.keys())
|
917
|
+
keys = set()
|
918
|
+
|
919
|
+
# Use a counter to check only the sample_size
|
920
|
+
count = 0
|
921
|
+
for scenario in self:
|
922
|
+
keys.update(scenario.keys())
|
923
|
+
count += 1
|
924
|
+
if count >= sample_size:
|
925
|
+
break
|
926
|
+
|
927
|
+
if keys != base_keys:
|
928
|
+
import warnings
|
929
|
+
warnings.warn(
|
930
|
+
"Ragged ScenarioList detected (different keys for different scenario entries). This may cause unexpected behavior."
|
931
|
+
)
|
932
|
+
except IndexError:
|
933
|
+
pass
|
934
|
+
|
935
|
+
# Create new ScenarioList with filtered data
|
936
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
813
937
|
|
814
938
|
def create_evaluator(scenario: Scenario):
|
815
|
-
"""Create an evaluator for the given
|
816
|
-
The 'combined_dict' is a mapping of all values for that Result object.
|
817
|
-
"""
|
939
|
+
"""Create an evaluator for the given scenario."""
|
818
940
|
return EvalWithCompoundTypes(names=scenario)
|
819
941
|
|
820
942
|
try:
|
821
|
-
#
|
822
|
-
|
823
|
-
|
943
|
+
# Process one scenario at a time to minimize memory usage
|
944
|
+
for scenario in self:
|
945
|
+
# Check if scenario matches the filter expression
|
824
946
|
if create_evaluator(scenario).eval(expression):
|
825
|
-
|
947
|
+
# Create a copy and immediately append to the new list
|
948
|
+
scenario_copy = scenario.copy()
|
949
|
+
new_sl.append(scenario_copy)
|
950
|
+
|
951
|
+
# Remove reference to allow for garbage collection
|
952
|
+
del scenario_copy
|
953
|
+
|
826
954
|
except NameNotDefined as e:
|
827
|
-
|
955
|
+
# Get available fields for error message
|
956
|
+
try:
|
957
|
+
first_item = self[0] if len(self) > 0 else None
|
958
|
+
available_fields = ", ".join(first_item.keys() if first_item else [])
|
959
|
+
except:
|
960
|
+
available_fields = "unknown"
|
961
|
+
|
828
962
|
raise ScenarioError(
|
829
963
|
f"Error in filter: '{e}'\n"
|
830
964
|
f"The expression '{expression}' refers to a field that does not exist.\n"
|
831
|
-
f"Scenario: {scenario}\n"
|
832
965
|
f"Available fields: {available_fields}\n"
|
833
966
|
"Check your filter expression or consult the documentation: "
|
834
967
|
"https://docs.expectedparrot.com/en/latest/scenarios.html#module-edsl.scenarios.Scenario"
|
@@ -836,18 +969,24 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
836
969
|
except Exception as e:
|
837
970
|
raise ScenarioError(f"Error in filter. Exception:{e}")
|
838
971
|
|
839
|
-
return
|
972
|
+
return new_sl
|
840
973
|
|
841
|
-
def from_urls(
|
842
|
-
self, urls: list[str], field_name: Optional[str] = "text"
|
843
|
-
) -> ScenarioList:
|
844
|
-
"""Create a ScenarioList from a list of URLs.
|
845
|
-
|
846
|
-
:param urls: A list of URLs.
|
847
|
-
:param field_name: The name of the field to store the text from the URLs.
|
848
974
|
|
975
|
+
@classmethod
|
976
|
+
def from_urls(cls, urls: list[str], field_name: Optional[str] = "text") -> ScenarioList:
|
977
|
+
from .scenario_source import URLSource
|
978
|
+
return URLSource(urls, field_name).to_scenario_list()
|
979
|
+
|
980
|
+
@classmethod
|
981
|
+
def from_list(cls, field_name: str, values: list, use_indexes: bool = False) -> ScenarioList:
|
982
|
+
"""Create a ScenarioList from a list of values with a specified field name.
|
983
|
+
|
984
|
+
>>> ScenarioList.from_list('text', ['a', 'b', 'c'])
|
985
|
+
ScenarioList([Scenario({'text': 'a'}), Scenario({'text': 'b'}), Scenario({'text': 'c'})])
|
849
986
|
"""
|
850
|
-
|
987
|
+
from .scenario_source import ListSource
|
988
|
+
return ListSource(field_name, values, use_indexes).to_scenario_list()
|
989
|
+
|
851
990
|
|
852
991
|
def select(self, *fields: str) -> ScenarioList:
|
853
992
|
"""
|
@@ -874,8 +1013,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
874
1013
|
>>> s.drop('a')
|
875
1014
|
ScenarioList([Scenario({'b': 1}), Scenario({'b': 2})])
|
876
1015
|
"""
|
877
|
-
|
878
|
-
|
1016
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1017
|
+
for scenario in self:
|
1018
|
+
new_sl.append(scenario.drop(fields))
|
1019
|
+
return new_sl
|
879
1020
|
|
880
1021
|
def keep(self, *fields: str) -> ScenarioList:
|
881
1022
|
"""Keep only the specified fields in the scenarios.
|
@@ -888,8 +1029,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
888
1029
|
>>> s.keep('a')
|
889
1030
|
ScenarioList([Scenario({'a': 1}), Scenario({'a': 1})])
|
890
1031
|
"""
|
891
|
-
|
892
|
-
|
1032
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1033
|
+
for scenario in self:
|
1034
|
+
new_sl.append(scenario.keep(fields))
|
1035
|
+
return new_sl
|
893
1036
|
|
894
1037
|
@classmethod
|
895
1038
|
def from_directory(
|
@@ -899,12 +1042,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
899
1042
|
key_name: str = "content",
|
900
1043
|
) -> "ScenarioList":
|
901
1044
|
"""Create a ScenarioList of Scenario objects from files in a directory.
|
902
|
-
|
1045
|
+
|
903
1046
|
This method scans a directory and creates a Scenario object for each file found,
|
904
1047
|
where each Scenario contains a FileStore object under the specified key.
|
905
1048
|
Optionally filters files based on a wildcard pattern. If no path is provided,
|
906
1049
|
the current working directory is used.
|
907
|
-
|
1050
|
+
|
908
1051
|
Args:
|
909
1052
|
path: The directory path to scan, optionally including a wildcard pattern.
|
910
1053
|
If None, uses the current working directory.
|
@@ -914,124 +1057,84 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
914
1057
|
- "*.txt" - scans only text files in the current working directory
|
915
1058
|
recursive: Whether to scan subdirectories recursively. Defaults to False.
|
916
1059
|
key_name: The key to use for the FileStore object in each Scenario. Defaults to "content".
|
917
|
-
|
1060
|
+
|
918
1061
|
Returns:
|
919
1062
|
A ScenarioList containing Scenario objects for all matching files, where each Scenario
|
920
1063
|
has a FileStore object under the specified key.
|
921
|
-
|
1064
|
+
|
922
1065
|
Raises:
|
923
1066
|
FileNotFoundError: If the specified directory does not exist.
|
924
|
-
|
1067
|
+
|
925
1068
|
Examples:
|
926
1069
|
# Get all files in the current directory with default key "content"
|
927
1070
|
sl = ScenarioList.from_directory()
|
928
|
-
|
1071
|
+
|
929
1072
|
# Get all Python files in a specific directory with custom key "python_file"
|
930
1073
|
sl = ScenarioList.from_directory('*.py', key_name="python_file")
|
931
|
-
|
1074
|
+
|
932
1075
|
# Get all image files in the current directory
|
933
1076
|
sl = ScenarioList.from_directory('*.png', key_name="image")
|
934
|
-
|
1077
|
+
|
935
1078
|
# Get all files recursively including subdirectories
|
936
1079
|
sl = ScenarioList.from_directory(recursive=True, key_name="document")
|
937
1080
|
"""
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
# Check if path contains any wildcard
|
947
|
-
if path and ('*' in path):
|
948
|
-
# Handle "**/*.ext" pattern - find the directory part before the **
|
949
|
-
if has_recursive_pattern:
|
950
|
-
# Extract the base directory by finding the part before **
|
951
|
-
parts = path.split('**')
|
952
|
-
if parts and parts[0]:
|
953
|
-
# Remove trailing slash if any
|
954
|
-
directory_path = parts[0].rstrip('/')
|
955
|
-
if not directory_path:
|
956
|
-
directory_path = os.getcwd()
|
957
|
-
# Get the pattern after **
|
958
|
-
pattern = parts[1] if len(parts) > 1 else None
|
959
|
-
if pattern and pattern.startswith('/'):
|
960
|
-
pattern = pattern[1:] # Remove leading slash
|
961
|
-
else:
|
962
|
-
directory_path = os.getcwd()
|
963
|
-
pattern = None
|
964
|
-
# Handle case where path is just a pattern (e.g., "*.py")
|
965
|
-
elif os.path.dirname(path) == '':
|
966
|
-
directory_path = os.getcwd()
|
967
|
-
pattern = os.path.basename(path)
|
968
|
-
else:
|
969
|
-
# Split into directory and pattern
|
970
|
-
directory_path = os.path.dirname(path)
|
971
|
-
if not directory_path:
|
972
|
-
directory_path = os.getcwd()
|
973
|
-
pattern = os.path.basename(path)
|
974
|
-
else:
|
975
|
-
# Path is a directory with no pattern
|
976
|
-
directory_path = path
|
977
|
-
pattern = None
|
978
|
-
|
979
|
-
# Ensure directory exists
|
980
|
-
if not os.path.isdir(directory_path):
|
981
|
-
from .exceptions import FileNotFoundScenarioError
|
982
|
-
raise FileNotFoundScenarioError(f"Directory not found: {directory_path}")
|
983
|
-
|
984
|
-
# Create a DirectoryScanner for the directory
|
985
|
-
scanner = DirectoryScanner(directory_path)
|
986
|
-
|
987
|
-
# Configure wildcard pattern filtering
|
988
|
-
suffix_allow_list = None
|
989
|
-
example_suffix = None
|
990
|
-
|
991
|
-
if pattern:
|
992
|
-
if pattern.startswith('*.'):
|
993
|
-
# Simple extension filter (e.g., "*.py")
|
994
|
-
suffix_allow_list = [pattern[2:]]
|
995
|
-
elif '*' in pattern:
|
996
|
-
# Other wildcard patterns
|
997
|
-
example_suffix = pattern
|
998
|
-
else:
|
999
|
-
# Handle simple non-wildcard pattern (exact match)
|
1000
|
-
example_suffix = pattern
|
1081
|
+
import warnings
|
1082
|
+
warnings.warn(
|
1083
|
+
"from_directory is deprecated. Use ScenarioSource.from_source('directory', ...) instead.",
|
1084
|
+
DeprecationWarning,
|
1085
|
+
stacklevel=2
|
1086
|
+
)
|
1087
|
+
from .scenario_source import DirectorySource
|
1001
1088
|
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1089
|
+
source = DirectorySource(
|
1090
|
+
directory=path or os.getcwd(),
|
1091
|
+
pattern="*",
|
1005
1092
|
recursive=recursive,
|
1006
|
-
|
1007
|
-
example_suffix=example_suffix
|
1093
|
+
metadata=True
|
1008
1094
|
)
|
1009
1095
|
|
1010
|
-
#
|
1011
|
-
|
1096
|
+
# Get the ScenarioList with FileStore objects under "file" key
|
1097
|
+
sl = source.to_scenario_list()
|
1012
1098
|
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1099
|
+
# If the requested key is different from the default "file" key used by DirectoryScanner.scan_directory,
|
1100
|
+
# rename the keys in all scenarios
|
1101
|
+
if key_name != "file":
|
1102
|
+
# Create a new ScenarioList
|
1103
|
+
result = ScenarioList([])
|
1104
|
+
for scenario in sl:
|
1105
|
+
# Create a new scenario with the file under the specified key
|
1106
|
+
new_data = {key_name: scenario["file"]}
|
1107
|
+
# Add all other fields from the original scenario
|
1108
|
+
for k, v in scenario.items():
|
1109
|
+
if k != "file":
|
1110
|
+
new_data[k] = v
|
1111
|
+
result.append(Scenario(new_data))
|
1112
|
+
return result
|
1113
|
+
|
1114
|
+
return sl
|
1020
1115
|
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1116
|
+
# @classmethod
|
1117
|
+
# def from_list(
|
1118
|
+
# cls, name: str, values: list, func: Optional[Callable] = None
|
1119
|
+
# ) -> ScenarioList:
|
1120
|
+
# """Create a ScenarioList from a list of values.
|
1024
1121
|
|
1025
|
-
|
1122
|
+
# :param name: The name of the field.
|
1123
|
+
# :param values: The list of values.
|
1124
|
+
# :param func: An optional function to apply to the values.
|
1026
1125
|
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1126
|
+
# Example:
|
1127
|
+
|
1128
|
+
# >>> ScenarioList.from_list('name', ['Alice', 'Bob'])
|
1129
|
+
# ScenarioList([Scenario({'name': 'Alice'}), Scenario({'name': 'Bob'})])
|
1130
|
+
# """
|
1131
|
+
# if not func:
|
1132
|
+
|
1133
|
+
# def identity(x):
|
1134
|
+
# return x
|
1135
|
+
|
1136
|
+
# func = identity
|
1137
|
+
# return cls([Scenario({name: func(value)}) for value in values])
|
1035
1138
|
|
1036
1139
|
def table(
|
1037
1140
|
self,
|
@@ -1041,7 +1144,6 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1041
1144
|
) -> str:
|
1042
1145
|
"""Return the ScenarioList as a table."""
|
1043
1146
|
|
1044
|
-
|
1045
1147
|
if tablefmt is not None and tablefmt not in tabulate_formats:
|
1046
1148
|
raise ValueError(
|
1047
1149
|
f"Invalid table format: {tablefmt}",
|
@@ -1084,11 +1186,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1084
1186
|
"""
|
1085
1187
|
assert set(new_order) == set(self.parameters)
|
1086
1188
|
|
1087
|
-
|
1189
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1088
1190
|
for scenario in self:
|
1089
1191
|
new_scenario = Scenario({key: scenario[key] for key in new_order})
|
1090
|
-
|
1091
|
-
return
|
1192
|
+
new_sl.append(new_scenario)
|
1193
|
+
return new_sl
|
1092
1194
|
|
1093
1195
|
def to_dataset(self) -> "Dataset":
|
1094
1196
|
"""
|
@@ -1128,7 +1230,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1128
1230
|
|
1129
1231
|
"""
|
1130
1232
|
new_names = new_names or [f"{field}_{i}" for i in range(len(self[0][field]))]
|
1131
|
-
|
1233
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1132
1234
|
for scenario in self:
|
1133
1235
|
new_scenario = scenario.copy()
|
1134
1236
|
if len(new_names) == 1:
|
@@ -1139,15 +1241,25 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1139
1241
|
|
1140
1242
|
if not keep_original:
|
1141
1243
|
del new_scenario[field]
|
1142
|
-
|
1143
|
-
return
|
1244
|
+
new_sl.append(new_scenario)
|
1245
|
+
return new_sl
|
1144
1246
|
|
1145
1247
|
@classmethod
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1248
|
+
@deprecated_classmethod("ScenarioSource.from_source('list_of_tuples', ...)")
|
1249
|
+
def from_list_of_tuples(cls, field_names: list[str], values: list[tuple], use_indexes: bool = False) -> ScenarioList:
|
1250
|
+
"""Create a ScenarioList from a list of tuples with specified field names.
|
1251
|
+
|
1252
|
+
Args:
|
1253
|
+
field_names: A list of field names for the tuples
|
1254
|
+
values: A list of tuples with values matching the field_names
|
1255
|
+
use_indexes: Whether to add an index field to each scenario
|
1256
|
+
|
1257
|
+
Returns:
|
1258
|
+
A ScenarioList containing the data from the tuples
|
1259
|
+
"""
|
1260
|
+
from .scenario_source import TuplesSource
|
1261
|
+
source = TuplesSource(field_names, values, use_indexes)
|
1262
|
+
return source.to_scenario_list()
|
1151
1263
|
|
1152
1264
|
def add_list(self, name: str, values: List[Any]) -> ScenarioList:
|
1153
1265
|
"""Add a list of values to a ScenarioList.
|
@@ -1158,19 +1270,25 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1158
1270
|
>>> s.add_list('age', [30, 25])
|
1159
1271
|
ScenarioList([Scenario({'name': 'Alice', 'age': 30}), Scenario({'name': 'Bob', 'age': 25})])
|
1160
1272
|
"""
|
1161
|
-
sl = self.duplicate()
|
1162
|
-
if len(values) != len(
|
1273
|
+
#sl = self.duplicate()
|
1274
|
+
if len(values) != len(self.data):
|
1163
1275
|
raise ScenarioError(
|
1164
|
-
f"Length of values ({len(values)}) does not match length of ScenarioList ({len(
|
1276
|
+
f"Length of values ({len(values)}) does not match length of ScenarioList ({len(self)})"
|
1165
1277
|
)
|
1278
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1166
1279
|
for i, value in enumerate(values):
|
1167
|
-
|
1168
|
-
|
1280
|
+
scenario = self.data[i]
|
1281
|
+
scenario[name] = value
|
1282
|
+
new_sl.append(scenario)
|
1283
|
+
return new_sl
|
1169
1284
|
|
1170
1285
|
@classmethod
|
1171
1286
|
def create_empty_scenario_list(cls, n: int) -> ScenarioList:
|
1172
1287
|
"""Create an empty ScenarioList with n scenarios.
|
1173
1288
|
|
1289
|
+
Args:
|
1290
|
+
n: The number of empty scenarios to create
|
1291
|
+
|
1174
1292
|
Example:
|
1175
1293
|
|
1176
1294
|
>>> ScenarioList.create_empty_scenario_list(3)
|
@@ -1187,11 +1305,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1187
1305
|
>>> s.add_value('age', 30)
|
1188
1306
|
ScenarioList([Scenario({'name': 'Alice', 'age': 30}), Scenario({'name': 'Bob', 'age': 30})])
|
1189
1307
|
"""
|
1190
|
-
|
1191
|
-
for scenario in
|
1308
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
1309
|
+
for scenario in self:
|
1192
1310
|
scenario[name] = value
|
1193
|
-
|
1194
|
-
|
1311
|
+
new_sl.append(scenario)
|
1312
|
+
return new_sl
|
1313
|
+
|
1195
1314
|
def rename(self, replacement_dict: dict) -> ScenarioList:
|
1196
1315
|
"""Rename the fields in the scenarios.
|
1197
1316
|
|
@@ -1204,13 +1323,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1204
1323
|
ScenarioList([Scenario({'first_name': 'Alice', 'years': 30}), Scenario({'first_name': 'Bob', 'years': 25})])
|
1205
1324
|
|
1206
1325
|
"""
|
1207
|
-
|
1208
|
-
for
|
1209
|
-
|
1210
|
-
|
1211
|
-
return
|
1212
|
-
|
1213
|
-
|
1326
|
+
new_sl = ScenarioList(data = [], codebook=self.codebook)
|
1327
|
+
for scenario in self:
|
1328
|
+
new_scenario = scenario.rename(replacement_dict)
|
1329
|
+
new_sl.append(new_scenario)
|
1330
|
+
return new_sl
|
1214
1331
|
|
1215
1332
|
def replace_names(self, new_names: list) -> ScenarioList:
|
1216
1333
|
"""Replace the field names in the scenarios with a new list of names.
|
@@ -1225,7 +1342,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1225
1342
|
"""
|
1226
1343
|
if not self:
|
1227
1344
|
return ScenarioList([])
|
1228
|
-
|
1345
|
+
|
1229
1346
|
if len(new_names) != len(self[0].keys()):
|
1230
1347
|
raise ScenarioError(
|
1231
1348
|
f"Length of new names ({len(new_names)}) does not match number of fields ({len(self[0].keys())})"
|
@@ -1253,72 +1370,71 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1253
1370
|
# return new_list
|
1254
1371
|
|
1255
1372
|
@classmethod
|
1256
|
-
|
1373
|
+
@deprecated_classmethod("ScenarioSource.from_source('sqlite', ...)")
|
1374
|
+
def from_sqlite(
|
1375
|
+
cls, filepath: str, table: Optional[str] = None, sql_query: Optional[str] = None
|
1376
|
+
):
|
1257
1377
|
"""Create a ScenarioList from a SQLite database.
|
1258
|
-
|
1378
|
+
|
1259
1379
|
Args:
|
1260
1380
|
filepath (str): Path to the SQLite database file
|
1261
1381
|
table (Optional[str]): Name of table to query. If None, sql_query must be provided.
|
1262
1382
|
sql_query (Optional[str]): SQL query to execute. Used if table is None.
|
1263
|
-
|
1383
|
+
|
1264
1384
|
Returns:
|
1265
1385
|
ScenarioList: List of scenarios created from database rows
|
1266
|
-
|
1386
|
+
|
1267
1387
|
Raises:
|
1268
1388
|
ValueError: If both table and sql_query are None
|
1269
1389
|
sqlite3.Error: If there is an error executing the database query
|
1270
1390
|
"""
|
1271
|
-
import
|
1272
|
-
|
1391
|
+
from .scenario_source import SQLiteSource
|
1392
|
+
|
1393
|
+
# Handle the case where sql_query is provided instead of table
|
1273
1394
|
if table is None and sql_query is None:
|
1274
1395
|
from .exceptions import ValueScenarioError
|
1275
1396
|
raise ValueScenarioError("Either table or sql_query must be provided")
|
1397
|
+
|
1398
|
+
if table is None:
|
1399
|
+
# We need to use the old implementation for SQL queries
|
1400
|
+
import sqlite3
|
1276
1401
|
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
if table is not None:
|
1282
|
-
cursor.execute(f"SELECT * FROM {table}")
|
1283
|
-
else:
|
1402
|
+
try:
|
1403
|
+
with sqlite3.connect(filepath) as conn:
|
1404
|
+
cursor = conn.cursor()
|
1284
1405
|
cursor.execute(sql_query)
|
1285
|
-
|
1286
|
-
|
1287
|
-
data = cursor.fetchall()
|
1288
|
-
|
1289
|
-
return cls([Scenario(dict(zip(columns, row))) for row in data])
|
1290
|
-
|
1291
|
-
except sqlite3.Error as e:
|
1292
|
-
raise sqlite3.Error(f"Database error occurred: {str(e)}")
|
1406
|
+
columns = [description[0] for description in cursor.description]
|
1407
|
+
data = cursor.fetchall()
|
1293
1408
|
|
1294
|
-
|
1295
|
-
def from_latex(cls, tex_file_path: str):
|
1296
|
-
with open(tex_file_path, "r") as file:
|
1297
|
-
lines = file.readlines()
|
1409
|
+
return cls([Scenario(dict(zip(columns, row))) for row in data])
|
1298
1410
|
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1411
|
+
except sqlite3.Error as e:
|
1412
|
+
raise sqlite3.Error(f"Database error occurred: {str(e)}")
|
1413
|
+
else:
|
1414
|
+
# If a table is specified, use SQLiteSource
|
1415
|
+
source = SQLiteSource(filepath, table)
|
1416
|
+
return source.to_scenario_list()
|
1303
1417
|
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1418
|
+
@classmethod
|
1419
|
+
@deprecated_classmethod("ScenarioSource.from_source('latex', ...)")
|
1420
|
+
def from_latex(cls, tex_file_path: str, table_index: int = 0, has_header: bool = True):
|
1421
|
+
"""Create a ScenarioList from a LaTeX file.
|
1422
|
+
|
1423
|
+
Args:
|
1424
|
+
tex_file_path: The path to the LaTeX file.
|
1425
|
+
table_index: The index of the table to extract (if multiple tables exist).
|
1426
|
+
Default is 0 (first table).
|
1427
|
+
has_header: Whether the table has a header row. Default is True.
|
1428
|
+
|
1429
|
+
Returns:
|
1430
|
+
ScenarioList: A new ScenarioList containing the data from the LaTeX table.
|
1431
|
+
"""
|
1432
|
+
from .scenario_source import LaTeXSource
|
1433
|
+
source = LaTeXSource(tex_file_path, table_index, has_header)
|
1434
|
+
return source.to_scenario_list()
|
1320
1435
|
|
1321
1436
|
@classmethod
|
1437
|
+
@deprecated_classmethod("ScenarioSource.from_source('google_doc', ...)")
|
1322
1438
|
def from_google_doc(cls, url: str) -> ScenarioList:
|
1323
1439
|
"""Create a ScenarioList from a Google Doc.
|
1324
1440
|
|
@@ -1332,30 +1448,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1332
1448
|
ScenarioList: An instance of the ScenarioList class.
|
1333
1449
|
|
1334
1450
|
"""
|
1335
|
-
import
|
1336
|
-
|
1337
|
-
|
1338
|
-
if "/edit" in url:
|
1339
|
-
doc_id = url.split("/d/")[1].split("/edit")[0]
|
1340
|
-
else:
|
1341
|
-
from .exceptions import ValueScenarioError
|
1342
|
-
raise ValueScenarioError("Invalid Google Doc URL format.")
|
1343
|
-
|
1344
|
-
export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
|
1345
|
-
|
1346
|
-
# Download the Google Doc as a Word file (.docx)
|
1347
|
-
response = requests.get(export_url)
|
1348
|
-
response.raise_for_status() # Ensure the request was successful
|
1349
|
-
|
1350
|
-
# Save the Word file to a temporary file
|
1351
|
-
with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
|
1352
|
-
temp_file.write(response.content)
|
1353
|
-
temp_filename = temp_file.name
|
1354
|
-
|
1355
|
-
# Call the from_docx class method with the temporary file
|
1356
|
-
return cls.from_docx(temp_filename)
|
1451
|
+
from .scenario_source import GoogleDocSource
|
1452
|
+
source = GoogleDocSource(url)
|
1453
|
+
return source.to_scenario_list()
|
1357
1454
|
|
1358
1455
|
@classmethod
|
1456
|
+
@deprecated_classmethod("ScenarioSource.from_source('pandas', ...)")
|
1359
1457
|
def from_pandas(cls, df) -> ScenarioList:
|
1360
1458
|
"""Create a ScenarioList from a pandas DataFrame.
|
1361
1459
|
|
@@ -1366,105 +1464,48 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1366
1464
|
>>> ScenarioList.from_pandas(df)
|
1367
1465
|
ScenarioList([Scenario({'name': 'Alice', 'age': 30, 'location': 'New York'}), Scenario({'name': 'Bob', 'age': 25, 'location': 'Los Angeles'})])
|
1368
1466
|
"""
|
1369
|
-
|
1370
|
-
|
1467
|
+
from .scenario_source import PandasSource
|
1468
|
+
source = PandasSource(df)
|
1469
|
+
return source.to_scenario_list()
|
1371
1470
|
|
1372
1471
|
@classmethod
|
1472
|
+
@deprecated_classmethod("ScenarioSource.from_source('dta', ...)")
|
1373
1473
|
def from_dta(cls, filepath: str, include_metadata: bool = True) -> ScenarioList:
|
1374
1474
|
"""Create a ScenarioList from a Stata file.
|
1375
|
-
|
1475
|
+
|
1376
1476
|
Args:
|
1377
1477
|
filepath (str): Path to the Stata (.dta) file
|
1378
1478
|
include_metadata (bool): If True, extract and preserve variable labels and value labels
|
1379
1479
|
as additional metadata in the ScenarioList
|
1380
|
-
|
1480
|
+
|
1381
1481
|
Returns:
|
1382
1482
|
ScenarioList: A ScenarioList containing the data from the Stata file
|
1383
1483
|
"""
|
1384
|
-
|
1385
|
-
|
1386
|
-
|
1387
|
-
df = pd.read_stata(filepath)
|
1388
|
-
|
1389
|
-
# Create the basic ScenarioList
|
1390
|
-
scenario_list = cls.from_pandas(df)
|
1391
|
-
|
1392
|
-
# Extract and preserve metadata if requested
|
1393
|
-
if include_metadata:
|
1394
|
-
# Get variable labels (if any)
|
1395
|
-
variable_labels = {}
|
1396
|
-
if hasattr(df, 'variable_labels') and df.variable_labels:
|
1397
|
-
variable_labels = df.variable_labels
|
1398
|
-
|
1399
|
-
# Get value labels (if any)
|
1400
|
-
value_labels = {}
|
1401
|
-
if hasattr(df, 'value_labels') and df.value_labels:
|
1402
|
-
value_labels = df.value_labels
|
1403
|
-
|
1404
|
-
# Store the metadata in the ScenarioList's codebook
|
1405
|
-
if variable_labels or value_labels:
|
1406
|
-
scenario_list.codebook = {
|
1407
|
-
'variable_labels': variable_labels,
|
1408
|
-
'value_labels': value_labels
|
1409
|
-
}
|
1410
|
-
|
1411
|
-
return scenario_list
|
1484
|
+
from .scenario_source import StataSource
|
1485
|
+
source = StataSource(filepath, include_metadata)
|
1486
|
+
return source.to_scenario_list()
|
1412
1487
|
|
1413
1488
|
@classmethod
|
1414
|
-
|
1489
|
+
@deprecated_classmethod("ScenarioSource.from_source('wikipedia', ...)")
|
1490
|
+
def from_wikipedia(cls, url: str, table_index: int = 0, header: bool = True):
|
1415
1491
|
"""
|
1416
1492
|
Extracts a table from a Wikipedia page.
|
1417
1493
|
|
1418
1494
|
Parameters:
|
1419
1495
|
url (str): The URL of the Wikipedia page.
|
1420
1496
|
table_index (int): The index of the table to extract (default is 0).
|
1497
|
+
header (bool): Whether the table has a header row (default is True).
|
1421
1498
|
|
1422
1499
|
Returns:
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
# if not df.empty:
|
1429
|
-
# print(df.head())
|
1430
|
-
# else:
|
1431
|
-
# print("Failed to extract table.")
|
1432
|
-
|
1433
|
-
|
1500
|
+
ScenarioList: A ScenarioList containing data from the Wikipedia table.
|
1501
|
+
|
1502
|
+
Example usage:
|
1503
|
+
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
|
1504
|
+
scenarios = ScenarioList.from_wikipedia(url, 0)
|
1434
1505
|
"""
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
try:
|
1440
|
-
# Check if the URL is reachable
|
1441
|
-
response = requests.get(url)
|
1442
|
-
response.raise_for_status() # Raises HTTPError for bad responses
|
1443
|
-
|
1444
|
-
# Extract tables from the Wikipedia page
|
1445
|
-
tables = pd.read_html(url)
|
1446
|
-
|
1447
|
-
# Ensure the requested table index is within the range of available tables
|
1448
|
-
if table_index >= len(tables) or table_index < 0:
|
1449
|
-
raise IndexError(
|
1450
|
-
f"Table index {table_index} is out of range. This page has {len(tables)} table(s)."
|
1451
|
-
)
|
1452
|
-
|
1453
|
-
# Return the requested table as a DataFrame
|
1454
|
-
# return tables[table_index]
|
1455
|
-
return cls.from_pandas(tables[table_index])
|
1456
|
-
|
1457
|
-
except RequestException as e:
|
1458
|
-
print(f"Error fetching the URL: {e}")
|
1459
|
-
except ValueError as e:
|
1460
|
-
print(f"Error parsing tables: {e}")
|
1461
|
-
except IndexError as e:
|
1462
|
-
print(e)
|
1463
|
-
except Exception as e:
|
1464
|
-
print(f"An unexpected error occurred: {e}")
|
1465
|
-
|
1466
|
-
# Return an empty DataFrame in case of an error
|
1467
|
-
# return cls.from_pandas(pd.DataFrame())
|
1506
|
+
from .scenario_source import WikipediaSource
|
1507
|
+
source = WikipediaSource(url, table_index, header)
|
1508
|
+
return source.to_scenario_list()
|
1468
1509
|
|
1469
1510
|
def to_key_value(self, field: str, value=None) -> Union[dict, set]:
|
1470
1511
|
"""Return the set of values in the field.
|
@@ -1484,8 +1525,14 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1484
1525
|
return {scenario[field]: scenario[value] for scenario in self}
|
1485
1526
|
|
1486
1527
|
@classmethod
|
1528
|
+
@deprecated_classmethod("ScenarioSource.from_source('excel', ...)")
|
1487
1529
|
def from_excel(
|
1488
|
-
cls,
|
1530
|
+
cls,
|
1531
|
+
filename: str,
|
1532
|
+
sheet_name: Optional[str] = None,
|
1533
|
+
skip_rows: Optional[List[int]] = None,
|
1534
|
+
use_codebook: bool = False,
|
1535
|
+
**kwargs
|
1489
1536
|
) -> ScenarioList:
|
1490
1537
|
"""Create a ScenarioList from an Excel file.
|
1491
1538
|
|
@@ -1497,6 +1544,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1497
1544
|
sheet_name (Optional[str]): Name of the sheet to load. If None and multiple sheets exist,
|
1498
1545
|
will raise an error listing available sheets.
|
1499
1546
|
skip_rows (Optional[List[int]]): List of row indices to skip (0-based). If None, all rows are included.
|
1547
|
+
use_codebook (bool): If True, rename columns to standard format and store original names in codebook.
|
1548
|
+
**kwargs: Additional parameters to pass to pandas.read_excel.
|
1500
1549
|
|
1501
1550
|
Example:
|
1502
1551
|
|
@@ -1531,52 +1580,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1531
1580
|
>>> scenario_list[1]['name']
|
1532
1581
|
'Charlie'
|
1533
1582
|
"""
|
1534
|
-
from .
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
print("The Excel file contains multiple sheets:")
|
1544
|
-
for name in all_sheets.keys():
|
1545
|
-
print(f"- {name}")
|
1546
|
-
from .exceptions import ValueScenarioError
|
1547
|
-
raise ValueScenarioError("Please provide a sheet name to load data from.")
|
1548
|
-
else:
|
1549
|
-
# If there is only one sheet, use it
|
1550
|
-
sheet_name = list(all_sheets.keys())[0]
|
1551
|
-
|
1552
|
-
# Load the specified or determined sheet
|
1553
|
-
df = pd.read_excel(filename, sheet_name=sheet_name)
|
1554
|
-
|
1555
|
-
# Skip specified rows if any
|
1556
|
-
if skip_rows:
|
1557
|
-
df = df.drop(skip_rows)
|
1558
|
-
# Reset index to ensure continuous indexing
|
1559
|
-
df = df.reset_index(drop=True)
|
1560
|
-
|
1561
|
-
if use_codebook:
|
1562
|
-
codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
|
1563
|
-
koobedoc = {col:f"col_{i}" for i, col in enumerate(df.columns)}
|
1564
|
-
|
1565
|
-
observations = []
|
1566
|
-
for _, row in df.iterrows():
|
1567
|
-
if use_codebook:
|
1568
|
-
observations.append(Scenario({koobedoc.get(k):v for k,v in row.to_dict().items()}))
|
1569
|
-
else:
|
1570
|
-
observations.append(Scenario(row.to_dict()))
|
1571
|
-
|
1572
|
-
|
1573
|
-
if use_codebook:
|
1574
|
-
return cls(observations, codebook=codebook)
|
1575
|
-
else:
|
1576
|
-
return cls(observations)
|
1583
|
+
from .scenario_source import ExcelSource
|
1584
|
+
source = ExcelSource(
|
1585
|
+
file_path=filename,
|
1586
|
+
sheet_name=sheet_name,
|
1587
|
+
skip_rows=skip_rows,
|
1588
|
+
use_codebook=use_codebook,
|
1589
|
+
**kwargs
|
1590
|
+
)
|
1591
|
+
return source.to_scenario_list()
|
1577
1592
|
|
1578
1593
|
@classmethod
|
1579
|
-
|
1594
|
+
@deprecated_classmethod("ScenarioSource.from_source('google_sheet', ...)")
|
1595
|
+
def from_google_sheet(
|
1596
|
+
cls, url: str, sheet_name: str = None, column_names: Optional[List[str]] = None, **kwargs
|
1597
|
+
) -> ScenarioList:
|
1580
1598
|
"""Create a ScenarioList from a Google Sheet.
|
1581
1599
|
|
1582
1600
|
This method downloads the Google Sheet as an Excel file, saves it to a temporary file,
|
@@ -1588,126 +1606,111 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1588
1606
|
the same as from_excel regarding multiple sheets.
|
1589
1607
|
column_names (List[str], optional): If provided, use these names for the columns instead
|
1590
1608
|
of the default column names from the sheet.
|
1609
|
+
**kwargs: Additional parameters to pass to pandas.read_excel.
|
1591
1610
|
|
1592
1611
|
Returns:
|
1593
1612
|
ScenarioList: An instance of the ScenarioList class.
|
1594
1613
|
|
1595
1614
|
"""
|
1596
|
-
import
|
1597
|
-
|
1598
|
-
|
1599
|
-
if "/edit" in url:
|
1600
|
-
sheet_id = url.split("/d/")[1].split("/edit")[0]
|
1601
|
-
else:
|
1602
|
-
from .exceptions import ValueScenarioError
|
1603
|
-
raise ValueScenarioError("Invalid Google Sheet URL format.")
|
1604
|
-
|
1605
|
-
export_url = (
|
1606
|
-
f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
|
1607
|
-
)
|
1608
|
-
|
1609
|
-
# Download the Google Sheet as an Excel file
|
1610
|
-
response = requests.get(export_url)
|
1611
|
-
response.raise_for_status() # Ensure the request was successful
|
1612
|
-
|
1613
|
-
# Save the Excel file to a temporary file
|
1614
|
-
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
|
1615
|
-
temp_file.write(response.content)
|
1616
|
-
temp_filename = temp_file.name
|
1617
|
-
|
1618
|
-
# First create the ScenarioList with default column names
|
1619
|
-
scenario_list = cls.from_excel(temp_filename, sheet_name=sheet_name)
|
1620
|
-
|
1621
|
-
# If column_names is provided, create a new ScenarioList with the specified names
|
1622
|
-
if column_names is not None:
|
1623
|
-
if len(column_names) != len(scenario_list[0].keys()):
|
1624
|
-
raise ValueError(
|
1625
|
-
f"Number of provided column names ({len(column_names)}) "
|
1626
|
-
f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
|
1627
|
-
)
|
1628
|
-
|
1629
|
-
# Create a codebook mapping original keys to new names
|
1630
|
-
original_keys = list(scenario_list[0].keys())
|
1631
|
-
codebook = dict(zip(original_keys, column_names))
|
1632
|
-
|
1633
|
-
# Return new ScenarioList with renamed columns
|
1634
|
-
return scenario_list.rename(codebook)
|
1635
|
-
else:
|
1636
|
-
return scenario_list
|
1615
|
+
from .scenario_source import GoogleSheetSource
|
1616
|
+
source = GoogleSheetSource(url, sheet_name=sheet_name, column_names=column_names, **kwargs)
|
1617
|
+
return source.to_scenario_list()
|
1637
1618
|
|
1638
1619
|
@classmethod
|
1620
|
+
@deprecated_classmethod("ScenarioSource.from_source('delimited_file', ...)")
|
1639
1621
|
def from_delimited_file(
|
1640
|
-
cls, source: Union[str, "ParseResult"], delimiter: str = ","
|
1622
|
+
cls, source: Union[str, "ParseResult"], delimiter: str = ",", encoding: str = "utf-8", **kwargs
|
1641
1623
|
) -> ScenarioList:
|
1642
|
-
"""Create a ScenarioList from a delimited file (CSV/TSV) or URL.
|
1643
|
-
|
1644
|
-
|
1645
|
-
|
1624
|
+
"""Create a ScenarioList from a delimited file (CSV/TSV) or URL.
|
1625
|
+
|
1626
|
+
Args:
|
1627
|
+
source: Path to a local file or URL to a remote file.
|
1628
|
+
delimiter: The delimiter character used in the file (default is ',').
|
1629
|
+
encoding: The file encoding to use (default is 'utf-8').
|
1630
|
+
**kwargs: Additional parameters for csv reader.
|
1631
|
+
|
1632
|
+
Returns:
|
1633
|
+
ScenarioList: An instance of the ScenarioList class.
|
1634
|
+
"""
|
1635
|
+
from .scenario_source import DelimitedFileSource
|
1646
1636
|
from urllib.parse import ParseResult
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
|
1653
|
-
|
1654
|
-
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1659
|
-
|
1660
|
-
|
1661
|
-
if isinstance(source, str) and is_url(source):
|
1662
|
-
response = requests.get(source, headers=headers)
|
1663
|
-
response.raise_for_status()
|
1664
|
-
file_obj = StringIO(response.text)
|
1665
|
-
elif isinstance(source, ParseResult):
|
1666
|
-
response = requests.get(source.geturl(), headers=headers)
|
1667
|
-
response.raise_for_status()
|
1668
|
-
file_obj = StringIO(response.text)
|
1669
|
-
else:
|
1670
|
-
# Try different encodings if the default fails
|
1671
|
-
encodings_to_try = ["utf-8", "latin-1", "cp1252", "ISO-8859-1"]
|
1672
|
-
last_exception = None
|
1673
|
-
file_obj = None
|
1674
|
-
|
1675
|
-
for encoding in encodings_to_try:
|
1676
|
-
try:
|
1677
|
-
file_obj = open(source, "r", encoding=encoding)
|
1678
|
-
# Test reading a bit to verify encoding
|
1679
|
-
file_obj.readline()
|
1680
|
-
file_obj.seek(0) # Reset file position
|
1681
|
-
break
|
1682
|
-
except UnicodeDecodeError as e:
|
1683
|
-
last_exception = e
|
1684
|
-
if file_obj:
|
1685
|
-
file_obj.close()
|
1686
|
-
file_obj = None
|
1687
|
-
|
1688
|
-
if file_obj is None:
|
1689
|
-
from .exceptions import ValueScenarioError
|
1690
|
-
raise ValueScenarioError(f"Could not decode file {source} with any of the attempted encodings. Original error: {last_exception}")
|
1691
|
-
|
1692
|
-
reader = csv.reader(file_obj, delimiter=delimiter)
|
1693
|
-
try:
|
1694
|
-
header = next(reader)
|
1695
|
-
observations = [Scenario(dict(zip(header, row))) for row in reader]
|
1696
|
-
except StopIteration:
|
1697
|
-
from .exceptions import ValueScenarioError
|
1698
|
-
raise ValueScenarioError(f"File {source} appears to be empty or has an invalid format")
|
1699
|
-
|
1700
|
-
finally:
|
1701
|
-
if file_obj:
|
1702
|
-
file_obj.close()
|
1703
|
-
|
1704
|
-
return cls(observations)
|
1637
|
+
|
1638
|
+
if isinstance(source, ParseResult):
|
1639
|
+
# Convert ParseResult to string URL
|
1640
|
+
file_or_url = source.geturl()
|
1641
|
+
else:
|
1642
|
+
file_or_url = source
|
1643
|
+
|
1644
|
+
source = DelimitedFileSource(
|
1645
|
+
file_or_url=file_or_url,
|
1646
|
+
delimiter=delimiter,
|
1647
|
+
encoding=encoding,
|
1648
|
+
**kwargs
|
1649
|
+
)
|
1650
|
+
return source.to_scenario_list()
|
1705
1651
|
|
1706
1652
|
# Convenience methods for specific file types
|
1707
1653
|
@classmethod
|
1708
|
-
|
1709
|
-
|
1710
|
-
|
1654
|
+
@deprecated_classmethod("ScenarioSource.from_source('csv', ...)")
|
1655
|
+
def from_csv(cls, source: Union[str, "ParseResult"], has_header: bool = True, encoding: str = "utf-8", **kwargs) -> ScenarioList:
|
1656
|
+
"""Create a ScenarioList from a CSV file or URL.
|
1657
|
+
|
1658
|
+
Args:
|
1659
|
+
source: Path to a local file or URL to a remote file.
|
1660
|
+
has_header: Whether the file has a header row (default is True).
|
1661
|
+
encoding: The file encoding to use (default is 'utf-8').
|
1662
|
+
**kwargs: Additional parameters for csv reader.
|
1663
|
+
|
1664
|
+
Returns:
|
1665
|
+
ScenarioList: An instance of the ScenarioList class.
|
1666
|
+
"""
|
1667
|
+
from .scenario_source import CSVSource
|
1668
|
+
from urllib.parse import ParseResult
|
1669
|
+
|
1670
|
+
if isinstance(source, ParseResult):
|
1671
|
+
# Convert ParseResult to string URL
|
1672
|
+
file_or_url = source.geturl()
|
1673
|
+
else:
|
1674
|
+
file_or_url = source
|
1675
|
+
|
1676
|
+
source = CSVSource(
|
1677
|
+
file_or_url=file_or_url,
|
1678
|
+
has_header=has_header,
|
1679
|
+
encoding=encoding,
|
1680
|
+
**kwargs
|
1681
|
+
)
|
1682
|
+
return source.to_scenario_list()
|
1683
|
+
|
1684
|
+
@classmethod
|
1685
|
+
@deprecated_classmethod("ScenarioSource.from_source('tsv', ...)")
|
1686
|
+
def from_tsv(cls, source: Union[str, "ParseResult"], has_header: bool = True, encoding: str = "utf-8", **kwargs) -> ScenarioList:
|
1687
|
+
"""Create a ScenarioList from a TSV file or URL.
|
1688
|
+
|
1689
|
+
Args:
|
1690
|
+
source: Path to a local file or URL to a remote file.
|
1691
|
+
has_header: Whether the file has a header row (default is True).
|
1692
|
+
encoding: The file encoding to use (default is 'utf-8').
|
1693
|
+
**kwargs: Additional parameters for csv reader.
|
1694
|
+
|
1695
|
+
Returns:
|
1696
|
+
ScenarioList: An instance of the ScenarioList class.
|
1697
|
+
"""
|
1698
|
+
from .scenario_source import TSVSource
|
1699
|
+
from urllib.parse import ParseResult
|
1700
|
+
|
1701
|
+
if isinstance(source, ParseResult):
|
1702
|
+
# Convert ParseResult to string URL
|
1703
|
+
file_or_url = source.geturl()
|
1704
|
+
else:
|
1705
|
+
file_or_url = source
|
1706
|
+
|
1707
|
+
source = TSVSource(
|
1708
|
+
file_or_url=file_or_url,
|
1709
|
+
has_header=has_header,
|
1710
|
+
encoding=encoding,
|
1711
|
+
**kwargs
|
1712
|
+
)
|
1713
|
+
return source.to_scenario_list()
|
1711
1714
|
|
1712
1715
|
def left_join(self, other: ScenarioList, by: Union[str, list[str]]) -> ScenarioList:
|
1713
1716
|
"""Perform a left join with another ScenarioList, following SQL join semantics.
|
@@ -1730,21 +1733,35 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1730
1733
|
@classmethod
|
1731
1734
|
def from_tsv(cls, source: Union[str, "ParseResult"]) -> ScenarioList:
|
1732
1735
|
"""Create a ScenarioList from a TSV file or URL."""
|
1733
|
-
|
1736
|
+
from .scenario_source import ScenarioSource
|
1737
|
+
|
1738
|
+
# Delegate to ScenarioSource implementation
|
1739
|
+
return ScenarioSource._from_tsv(source)
|
1734
1740
|
|
1735
1741
|
def to_dict(self, sort: bool = False, add_edsl_version: bool = True) -> dict:
|
1736
1742
|
"""
|
1737
1743
|
>>> s = ScenarioList([Scenario({'food': 'wood chips'}), Scenario({'food': 'wood-fired pizza'})])
|
1738
|
-
>>> s.to_dict()
|
1744
|
+
>>> s.to_dict() # doctest: +ELLIPSIS
|
1739
1745
|
{'scenarios': [{'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}, {'food': 'wood-fired pizza', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}], 'edsl_version': '...', 'edsl_class_name': 'ScenarioList'}
|
1740
1746
|
|
1747
|
+
>>> s = ScenarioList([Scenario({'food': 'wood chips'})], codebook={'food': 'description'})
|
1748
|
+
>>> d = s.to_dict()
|
1749
|
+
>>> 'codebook' in d
|
1750
|
+
True
|
1751
|
+
>>> d['codebook'] == {'food': 'description'}
|
1752
|
+
True
|
1741
1753
|
"""
|
1742
1754
|
if sort:
|
1743
1755
|
data = sorted(self, key=lambda x: hash(x))
|
1744
1756
|
else:
|
1745
1757
|
data = self
|
1758
|
+
|
1746
1759
|
d = {"scenarios": [s.to_dict(add_edsl_version=add_edsl_version) for s in data]}
|
1747
1760
|
|
1761
|
+
# Add codebook if it exists
|
1762
|
+
if hasattr(self, 'codebook') and self.codebook:
|
1763
|
+
d['codebook'] = self.codebook
|
1764
|
+
|
1748
1765
|
if add_edsl_version:
|
1749
1766
|
from .. import __version__
|
1750
1767
|
|
@@ -1758,8 +1775,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1758
1775
|
:param survey: The Survey object to use for the Jobs object.
|
1759
1776
|
|
1760
1777
|
Example:
|
1761
|
-
>>> from edsl import Survey, Jobs, ScenarioList
|
1762
|
-
>>> isinstance(ScenarioList.example().to(Survey.example()), Jobs)
|
1778
|
+
>>> from edsl import Survey, Jobs, ScenarioList # doctest: +SKIP
|
1779
|
+
>>> isinstance(ScenarioList.example().to(Survey.example()), Jobs) # doctest: +SKIP
|
1763
1780
|
True
|
1764
1781
|
"""
|
1765
1782
|
from ..surveys import Survey
|
@@ -1786,11 +1803,23 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1786
1803
|
|
1787
1804
|
@classmethod
|
1788
1805
|
@remove_edsl_version
|
1789
|
-
def from_dict(cls, data) -> ScenarioList:
|
1790
|
-
"""Create a `ScenarioList` from a dictionary.
|
1806
|
+
def from_dict(cls, data: dict) -> ScenarioList:
|
1807
|
+
"""Create a `ScenarioList` from a dictionary.
|
1808
|
+
|
1809
|
+
>>> d = {'scenarios': [{'food': 'wood chips'}], 'codebook': {'food': 'description'}}
|
1810
|
+
>>> s = ScenarioList.from_dict(d)
|
1811
|
+
>>> s.codebook == {'food': 'description'}
|
1812
|
+
True
|
1813
|
+
>>> s[0]['food']
|
1814
|
+
'wood chips'
|
1815
|
+
"""
|
1791
1816
|
from .scenario import Scenario
|
1792
1817
|
|
1793
|
-
|
1818
|
+
# Extract codebook if it exists
|
1819
|
+
codebook = data.get('codebook', None)
|
1820
|
+
|
1821
|
+
# Create ScenarioList with scenarios and codebook
|
1822
|
+
return cls([Scenario.from_dict(s) for s in data["scenarios"]], codebook=codebook)
|
1794
1823
|
|
1795
1824
|
@classmethod
|
1796
1825
|
def from_nested_dict(cls, data: dict) -> ScenarioList:
|
@@ -1835,62 +1864,80 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1835
1864
|
"""
|
1836
1865
|
return cls([Scenario.example(randomize), Scenario.example(randomize)])
|
1837
1866
|
|
1838
|
-
def __getitem__(self, key: Union[int, slice]) -> Any:
|
1839
|
-
"""Return the item at the given index.
|
1840
1867
|
|
1841
|
-
|
1842
|
-
|
1843
|
-
>>> s[0]
|
1844
|
-
Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})
|
1868
|
+
def items(self):
|
1869
|
+
"""Make this class compatible with dict.items() by accessing first scenario items.
|
1845
1870
|
|
1846
|
-
|
1847
|
-
|
1871
|
+
This ensures the class works as a drop-in replacement for UserList in code
|
1872
|
+
that expects a dictionary-like interface.
|
1848
1873
|
|
1874
|
+
Returns:
|
1875
|
+
items view from the first scenario object if available, empty list otherwise
|
1849
1876
|
"""
|
1850
|
-
if
|
1851
|
-
return
|
1852
|
-
|
1853
|
-
return super().__getitem__(key)
|
1854
|
-
else:
|
1855
|
-
return self.to_dict(add_edsl_version=False)[key]
|
1877
|
+
if len(self.data) > 0:
|
1878
|
+
return self.data[0].items()
|
1879
|
+
return {}.items()
|
1856
1880
|
|
1857
|
-
def
|
1858
|
-
"""
|
1881
|
+
def copy(self):
|
1882
|
+
"""Create a copy of this ScenarioList.
|
1859
1883
|
|
1860
|
-
|
1861
|
-
|
1862
|
-
>>> s = ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5}), Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
|
1863
|
-
>>> s.to_agent_list()
|
1864
|
-
AgentList([Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5}), Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5})])
|
1884
|
+
Returns:
|
1885
|
+
A new ScenarioList with copies of the same scenarios
|
1865
1886
|
"""
|
1866
|
-
|
1887
|
+
# Get copies of all scenarios
|
1888
|
+
if len(self.data) > 0:
|
1889
|
+
# If we have at least one scenario, copy the first one
|
1890
|
+
if hasattr(self.data[0], "copy"):
|
1891
|
+
return self.data[0].copy()
|
1892
|
+
# Otherwise try to convert to Scenario
|
1893
|
+
from .scenario import Scenario
|
1867
1894
|
|
1868
|
-
|
1869
|
-
|
1870
|
-
|
1871
|
-
|
1872
|
-
|
1873
|
-
proposed_agent_name = "agent_name"
|
1874
|
-
while proposed_agent_name not in new_scenario:
|
1875
|
-
proposed_agent_name += "_"
|
1876
|
-
warnings.warn(
|
1877
|
-
f"The 'name' field is reserved for the agent's name---putting this value in {proposed_agent_name}"
|
1878
|
-
)
|
1879
|
-
new_scenario[proposed_agent_name] = name
|
1880
|
-
new_agent = Agent(traits=new_scenario, name=name)
|
1881
|
-
if "agent_parameters" in new_scenario:
|
1882
|
-
agent_parameters = new_scenario.pop("agent_parameters")
|
1883
|
-
instruction = agent_parameters.get("instruction", None)
|
1884
|
-
name = agent_parameters.get("name", None)
|
1885
|
-
new_agent = Agent(
|
1886
|
-
traits=new_scenario, name=name, instruction=instruction
|
1887
|
-
)
|
1888
|
-
else:
|
1889
|
-
new_agent = Agent(traits=new_scenario)
|
1895
|
+
try:
|
1896
|
+
return Scenario(dict(self.data[0]))
|
1897
|
+
except (TypeError, ValueError):
|
1898
|
+
# Fallback to empty scenario
|
1899
|
+
return Scenario({})
|
1890
1900
|
|
1891
|
-
agents.append(new_agent)
|
1892
1901
|
|
1893
|
-
|
1902
|
+
def to_agent_list(self):
|
1903
|
+
"""Convert the ScenarioList to an AgentList.
|
1904
|
+
|
1905
|
+
This method supports special fields that map to Agent parameters:
|
1906
|
+
- "name": Will be used as the agent's name
|
1907
|
+
- "agent_parameters": A dictionary containing:
|
1908
|
+
- "instruction": The agent's instruction text
|
1909
|
+
- "name": The agent's name (overrides the "name" field if present)
|
1910
|
+
|
1911
|
+
Example:
|
1912
|
+
>>> from edsl import ScenarioList, Scenario
|
1913
|
+
>>> # Basic usage with traits
|
1914
|
+
>>> s = ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
|
1915
|
+
>>> al = s.to_agent_list()
|
1916
|
+
>>> al
|
1917
|
+
AgentList([Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5})])
|
1918
|
+
|
1919
|
+
>>> # Using agent name
|
1920
|
+
>>> s = ScenarioList([Scenario({'name': 'Alice', 'age': 22})])
|
1921
|
+
>>> al = s.to_agent_list()
|
1922
|
+
>>> al[0].name
|
1923
|
+
'Alice'
|
1924
|
+
|
1925
|
+
>>> # Using agent parameters for instructions
|
1926
|
+
>>> s = ScenarioList([Scenario({
|
1927
|
+
... 'age': 22,
|
1928
|
+
... 'agent_parameters': {
|
1929
|
+
... 'instruction': 'You are a helpful assistant',
|
1930
|
+
... 'name': 'Assistant'
|
1931
|
+
... }
|
1932
|
+
... })])
|
1933
|
+
>>> al = s.to_agent_list()
|
1934
|
+
>>> al[0].instruction
|
1935
|
+
'You are a helpful assistant'
|
1936
|
+
>>> al[0].name
|
1937
|
+
'Assistant'
|
1938
|
+
"""
|
1939
|
+
from ..agents import AgentList
|
1940
|
+
return AgentList.from_scenario_list(self)
|
1894
1941
|
|
1895
1942
|
def chunk(
|
1896
1943
|
self,
|
@@ -1920,7 +1967,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1920
1967
|
new_scenarios.extend(replacement_scenarios)
|
1921
1968
|
return ScenarioList(new_scenarios)
|
1922
1969
|
|
1923
|
-
def collapse(
|
1970
|
+
def collapse(
|
1971
|
+
self, field: str, separator: Optional[str] = None, add_count: bool = False
|
1972
|
+
) -> ScenarioList:
|
1924
1973
|
"""Collapse a ScenarioList by grouping on all fields except the specified one,
|
1925
1974
|
collecting the values of the specified field into a list.
|
1926
1975
|
|
@@ -1943,10 +1992,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1943
1992
|
"""
|
1944
1993
|
if not self:
|
1945
1994
|
return ScenarioList([])
|
1946
|
-
|
1995
|
+
|
1947
1996
|
# Determine all fields except the one to collapse
|
1948
1997
|
id_vars = [key for key in self[0].keys() if key != field]
|
1949
|
-
|
1998
|
+
|
1950
1999
|
# Group the scenarios
|
1951
2000
|
grouped = defaultdict(list)
|
1952
2001
|
for scenario in self:
|
@@ -1954,33 +2003,34 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1954
2003
|
key = tuple(scenario[id_var] for id_var in id_vars)
|
1955
2004
|
# Add the value of the field to collapse to the list for this key
|
1956
2005
|
grouped[key].append(scenario[field])
|
1957
|
-
|
2006
|
+
|
1958
2007
|
# Create a new ScenarioList with the collapsed field
|
1959
|
-
|
2008
|
+
new_sl = ScenarioList(data = [], codebook=self.codebook)
|
1960
2009
|
for key, values in grouped.items():
|
1961
2010
|
new_scenario = dict(zip(id_vars, key))
|
1962
2011
|
if separator:
|
1963
|
-
new_scenario[field] = separator.join(values)
|
2012
|
+
new_scenario[field] = separator.join([str(x) for x in values])
|
1964
2013
|
else:
|
1965
2014
|
new_scenario[field] = values
|
1966
2015
|
if add_count:
|
1967
|
-
new_scenario[
|
1968
|
-
|
1969
|
-
|
1970
|
-
return ScenarioList(result)
|
2016
|
+
new_scenario["num_collapsed_rows"] = len(values)
|
2017
|
+
new_sl.append(Scenario(new_scenario))
|
2018
|
+
|
2019
|
+
#return ScenarioList(result)
|
2020
|
+
return new_sl
|
1971
2021
|
|
1972
2022
|
def create_comparisons(
|
1973
|
-
self,
|
1974
|
-
bidirectional: bool = False,
|
2023
|
+
self,
|
2024
|
+
bidirectional: bool = False,
|
1975
2025
|
num_options: int = 2,
|
1976
2026
|
option_prefix: str = "option_",
|
1977
|
-
use_alphabet: bool = False
|
2027
|
+
use_alphabet: bool = False,
|
1978
2028
|
) -> ScenarioList:
|
1979
2029
|
"""Create a new ScenarioList with comparisons between scenarios.
|
1980
|
-
|
2030
|
+
|
1981
2031
|
Each scenario in the result contains multiple original scenarios as dictionaries,
|
1982
2032
|
allowing for side-by-side comparison.
|
1983
|
-
|
2033
|
+
|
1984
2034
|
Args:
|
1985
2035
|
bidirectional (bool): If True, include both (A,B) and (B,A) comparisons.
|
1986
2036
|
If False, only include (A,B) where A comes before B in the original list.
|
@@ -1991,11 +2041,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
1991
2041
|
Ignored if use_alphabet is True.
|
1992
2042
|
use_alphabet (bool): If True, use letters as keys (A, B, C, etc.) instead of
|
1993
2043
|
the option_prefix with numbers.
|
1994
|
-
|
2044
|
+
|
1995
2045
|
Returns:
|
1996
2046
|
ScenarioList: A new ScenarioList where each scenario contains multiple original
|
1997
2047
|
scenarios as dictionaries.
|
1998
|
-
|
2048
|
+
|
1999
2049
|
Example:
|
2000
2050
|
>>> s = ScenarioList([
|
2001
2051
|
... Scenario({'id': 1, 'text': 'Option A'}),
|
@@ -2009,22 +2059,29 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
2009
2059
|
"""
|
2010
2060
|
from itertools import combinations, permutations
|
2011
2061
|
import string
|
2012
|
-
|
2062
|
+
|
2013
2063
|
if num_options < 2:
|
2014
2064
|
from .exceptions import ValueScenarioError
|
2065
|
+
|
2015
2066
|
raise ValueScenarioError("num_options must be at least 2")
|
2016
|
-
|
2067
|
+
|
2017
2068
|
if num_options > len(self):
|
2018
2069
|
from .exceptions import ValueScenarioError
|
2019
|
-
|
2020
|
-
|
2070
|
+
|
2071
|
+
raise ValueScenarioError(
|
2072
|
+
f"num_options ({num_options}) cannot exceed the number of scenarios ({len(self)})"
|
2073
|
+
)
|
2074
|
+
|
2021
2075
|
if use_alphabet and num_options > 26:
|
2022
2076
|
from .exceptions import ValueScenarioError
|
2023
|
-
|
2024
|
-
|
2077
|
+
|
2078
|
+
raise ValueScenarioError(
|
2079
|
+
"When using alphabet labels, num_options cannot exceed 26 (the number of letters in the English alphabet)"
|
2080
|
+
)
|
2081
|
+
|
2025
2082
|
# Convert each scenario to a dictionary
|
2026
2083
|
scenario_dicts = [scenario.to_dict(add_edsl_version=False) for scenario in self]
|
2027
|
-
|
2084
|
+
|
2028
2085
|
# Generate combinations or permutations based on bidirectional flag
|
2029
2086
|
if bidirectional:
|
2030
2087
|
# For bidirectional, use permutations to get all ordered arrangements
|
@@ -2032,13 +2089,13 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
2032
2089
|
# For pairwise, we can use permutations with r=2
|
2033
2090
|
scenario_groups = permutations(scenario_dicts, 2)
|
2034
2091
|
else:
|
2035
|
-
# For more than 2 options with bidirectional=True,
|
2092
|
+
# For more than 2 options with bidirectional=True,
|
2036
2093
|
# we need all permutations of the specified size
|
2037
2094
|
scenario_groups = permutations(scenario_dicts, num_options)
|
2038
2095
|
else:
|
2039
2096
|
# For unidirectional, use combinations to get unordered groups
|
2040
2097
|
scenario_groups = combinations(scenario_dicts, num_options)
|
2041
|
-
|
2098
|
+
|
2042
2099
|
# Create new scenarios with the combinations
|
2043
2100
|
result = []
|
2044
2101
|
for group in scenario_groups:
|
@@ -2052,64 +2109,35 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
2052
2109
|
key = f"{option_prefix}{i+1}"
|
2053
2110
|
new_scenario[key] = scenario_dict
|
2054
2111
|
result.append(Scenario(new_scenario))
|
2055
|
-
|
2112
|
+
|
2056
2113
|
return ScenarioList(result)
|
2114
|
+
|
2057
2115
|
|
2058
2116
|
@classmethod
|
2117
|
+
@deprecated_classmethod("ScenarioSource.from_source('parquet', ...)")
|
2059
2118
|
def from_parquet(cls, filepath: str) -> ScenarioList:
|
2060
2119
|
"""Create a ScenarioList from a Parquet file.
|
2061
|
-
|
2120
|
+
|
2062
2121
|
Args:
|
2063
|
-
filepath (str):
|
2064
|
-
|
2122
|
+
filepath (str): The path to the Parquet file.
|
2123
|
+
|
2065
2124
|
Returns:
|
2066
|
-
ScenarioList: A ScenarioList containing the
|
2067
|
-
|
2068
|
-
Example:
|
2069
|
-
>>> import pandas as pd
|
2070
|
-
>>> import tempfile
|
2071
|
-
>>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [30, 25]})
|
2072
|
-
>>> # The following would create and read a parquet file if dependencies are installed:
|
2073
|
-
>>> # with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f:
|
2074
|
-
>>> # df.to_parquet(f.name)
|
2075
|
-
>>> # scenario_list = ScenarioList.from_parquet(f.name)
|
2076
|
-
>>> # Instead, we'll demonstrate the equivalent result:
|
2077
|
-
>>> scenario_list = ScenarioList.from_pandas(df)
|
2078
|
-
>>> len(scenario_list)
|
2079
|
-
2
|
2080
|
-
>>> scenario_list[0]['name']
|
2081
|
-
'Alice'
|
2125
|
+
ScenarioList: A new ScenarioList containing the scenarios from the Parquet file.
|
2082
2126
|
"""
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2086
|
-
# Try to read the Parquet file with pandas
|
2087
|
-
df = pd.read_parquet(filepath)
|
2088
|
-
except ImportError as e:
|
2089
|
-
# Handle missing dependencies with a helpful error message
|
2090
|
-
if "pyarrow" in str(e) or "fastparquet" in str(e):
|
2091
|
-
raise ImportError(
|
2092
|
-
"Missing dependencies for Parquet support. Please install either pyarrow or fastparquet:\n"
|
2093
|
-
" pip install pyarrow\n"
|
2094
|
-
" or\n"
|
2095
|
-
" pip install fastparquet"
|
2096
|
-
) from e
|
2097
|
-
else:
|
2098
|
-
raise
|
2099
|
-
|
2100
|
-
# Convert the DataFrame to a ScenarioList
|
2101
|
-
return cls.from_pandas(df)
|
2127
|
+
from .scenario_source import ParquetSource
|
2128
|
+
source = ParquetSource(filepath)
|
2129
|
+
return source.to_scenario_list()
|
2102
2130
|
|
2103
|
-
def replace_values(self, replacements:dict) -> "ScenarioList":
|
2131
|
+
def replace_values(self, replacements: dict) -> "ScenarioList":
|
2104
2132
|
"""
|
2105
2133
|
Create new scenarios with values replaced according to the provided replacement dictionary.
|
2106
|
-
|
2134
|
+
|
2107
2135
|
Args:
|
2108
2136
|
replacements (dict): Dictionary of values to replace {old_value: new_value}
|
2109
|
-
|
2137
|
+
|
2110
2138
|
Returns:
|
2111
2139
|
ScenarioList: A new ScenarioList with replaced values
|
2112
|
-
|
2140
|
+
|
2113
2141
|
Examples:
|
2114
2142
|
>>> scenarios = ScenarioList([
|
2115
2143
|
... Scenario({'a': 'nan', 'b': 1}),
|
@@ -2122,7 +2150,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
2122
2150
|
>>> print(scenarios)
|
2123
2151
|
ScenarioList([Scenario({'a': 'nan', 'b': 1}), Scenario({'a': 2, 'b': 'nan'})])
|
2124
2152
|
"""
|
2125
|
-
|
2153
|
+
new_sl = ScenarioList(data=[], codebook=self.codebook)
|
2126
2154
|
for scenario in self:
|
2127
2155
|
new_scenario = {}
|
2128
2156
|
for key, value in scenario.items():
|
@@ -2130,20 +2158,65 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
|
|
2130
2158
|
new_scenario[key] = replacements[str(value)]
|
2131
2159
|
else:
|
2132
2160
|
new_scenario[key] = value
|
2133
|
-
|
2134
|
-
return
|
2135
|
-
|
2161
|
+
new_sl.append(Scenario(new_scenario))
|
2162
|
+
return new_sl
|
2163
|
+
|
2136
2164
|
@classmethod
|
2165
|
+
@deprecated_classmethod("ScenarioSource.from_source('pdf', ...)")
|
2137
2166
|
def from_pdf(cls, filename_or_url, collapse_pages=False):
|
2138
|
-
|
2139
|
-
|
2167
|
+
"""Create a ScenarioList from a PDF file or URL."""
|
2168
|
+
from .scenario_source import PDFSource
|
2169
|
+
|
2170
|
+
source = PDFSource(
|
2171
|
+
file_path=filename_or_url,
|
2172
|
+
chunk_type="page" if not collapse_pages else "text",
|
2173
|
+
chunk_size=1
|
2174
|
+
)
|
2175
|
+
return source.to_scenario_list()
|
2176
|
+
|
2140
2177
|
@classmethod
|
2178
|
+
@deprecated_classmethod("ScenarioSource.from_source('pdf_to_image', ...)")
|
2141
2179
|
def from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
|
2142
|
-
|
2143
|
-
|
2180
|
+
"""Create a ScenarioList with images extracted from a PDF file."""
|
2181
|
+
from .scenario_source import PDFImageSource
|
2182
|
+
|
2183
|
+
source = PDFImageSource(
|
2184
|
+
file_path=pdf_path,
|
2185
|
+
base_width=2000,
|
2186
|
+
include_text=True
|
2187
|
+
)
|
2188
|
+
return source.to_scenario_list()
|
2189
|
+
|
2190
|
+
@classmethod
|
2191
|
+
def from_source(cls, source_type: str, *args, **kwargs) -> "ScenarioList":
|
2192
|
+
"""
|
2193
|
+
Create a ScenarioList from a specified source type.
|
2194
|
+
|
2195
|
+
This method serves as the main entry point for creating ScenarioList objects,
|
2196
|
+
providing a unified interface for various data sources.
|
2197
|
+
|
2198
|
+
Args:
|
2199
|
+
source_type: The type of source to create a ScenarioList from.
|
2200
|
+
Valid values include: 'urls', 'directory', 'csv', 'tsv',
|
2201
|
+
'excel', 'pdf', 'pdf_to_image', and others.
|
2202
|
+
*args: Positional arguments to pass to the source-specific method.
|
2203
|
+
**kwargs: Keyword arguments to pass to the source-specific method.
|
2204
|
+
|
2205
|
+
Returns:
|
2206
|
+
A ScenarioList object created from the specified source.
|
2207
|
+
|
2208
|
+
Examples:
|
2209
|
+
>>> # This is a simplified example for doctest
|
2210
|
+
>>> # In real usage, you would provide a path to your CSV file:
|
2211
|
+
>>> # sl_csv = ScenarioList.from_source('csv', 'your_data.csv')
|
2212
|
+
>>> # Or use other source types like 'directory', 'excel', etc.
|
2213
|
+
>>> # Examples of other source types:
|
2214
|
+
>>> # sl_dir = ScenarioList.from_source('directory', '/path/to/files')
|
2215
|
+
"""
|
2216
|
+
from .scenario_source import ScenarioSource
|
2217
|
+
return ScenarioSource.from_source(source_type, *args, **kwargs)
|
2144
2218
|
|
2145
2219
|
|
2146
2220
|
if __name__ == "__main__":
|
2147
2221
|
import doctest
|
2148
|
-
|
2149
|
-
doctest.testmod(optionflags=doctest.ELLIPSIS)
|
2222
|
+
doctest.testmod(optionflags=doctest.ELLIPSIS)
|