edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/buckets/__init__.py +8 -3
  8. edsl/buckets/bucket_collection.py +9 -3
  9. edsl/buckets/model_buckets.py +4 -2
  10. edsl/buckets/token_bucket.py +2 -2
  11. edsl/buckets/token_bucket_client.py +5 -3
  12. edsl/caching/cache.py +131 -62
  13. edsl/caching/cache_entry.py +70 -58
  14. edsl/caching/sql_dict.py +17 -0
  15. edsl/cli.py +99 -0
  16. edsl/config/config_class.py +16 -0
  17. edsl/conversation/__init__.py +31 -0
  18. edsl/coop/coop.py +276 -242
  19. edsl/coop/coop_jobs_objects.py +59 -0
  20. edsl/coop/coop_objects.py +29 -0
  21. edsl/coop/coop_regular_objects.py +26 -0
  22. edsl/coop/utils.py +24 -19
  23. edsl/dataset/dataset.py +338 -101
  24. edsl/db_list/sqlite_list.py +349 -0
  25. edsl/inference_services/__init__.py +40 -5
  26. edsl/inference_services/exceptions.py +11 -0
  27. edsl/inference_services/services/anthropic_service.py +5 -2
  28. edsl/inference_services/services/aws_bedrock.py +6 -2
  29. edsl/inference_services/services/azure_ai.py +6 -2
  30. edsl/inference_services/services/google_service.py +3 -2
  31. edsl/inference_services/services/mistral_ai_service.py +6 -2
  32. edsl/inference_services/services/open_ai_service.py +6 -2
  33. edsl/inference_services/services/perplexity_service.py +6 -2
  34. edsl/inference_services/services/test_service.py +105 -7
  35. edsl/interviews/answering_function.py +167 -59
  36. edsl/interviews/interview.py +124 -72
  37. edsl/interviews/interview_task_manager.py +10 -0
  38. edsl/invigilators/invigilators.py +10 -1
  39. edsl/jobs/async_interview_runner.py +146 -104
  40. edsl/jobs/data_structures.py +6 -4
  41. edsl/jobs/decorators.py +61 -0
  42. edsl/jobs/fetch_invigilator.py +61 -18
  43. edsl/jobs/html_table_job_logger.py +14 -2
  44. edsl/jobs/jobs.py +180 -104
  45. edsl/jobs/jobs_component_constructor.py +2 -2
  46. edsl/jobs/jobs_interview_constructor.py +2 -0
  47. edsl/jobs/jobs_pricing_estimation.py +127 -46
  48. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  49. edsl/jobs/jobs_runner_status.py +30 -25
  50. edsl/jobs/progress_bar_manager.py +79 -0
  51. edsl/jobs/remote_inference.py +35 -1
  52. edsl/key_management/key_lookup_builder.py +6 -1
  53. edsl/language_models/language_model.py +102 -12
  54. edsl/language_models/model.py +10 -3
  55. edsl/language_models/price_manager.py +45 -75
  56. edsl/language_models/registry.py +5 -0
  57. edsl/language_models/utilities.py +2 -1
  58. edsl/notebooks/notebook.py +77 -10
  59. edsl/questions/VALIDATION_README.md +134 -0
  60. edsl/questions/__init__.py +24 -1
  61. edsl/questions/exceptions.py +21 -0
  62. edsl/questions/question_check_box.py +171 -149
  63. edsl/questions/question_dict.py +243 -51
  64. edsl/questions/question_multiple_choice_with_other.py +624 -0
  65. edsl/questions/question_registry.py +2 -1
  66. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  67. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  68. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  69. edsl/questions/validation_analysis.py +185 -0
  70. edsl/questions/validation_cli.py +131 -0
  71. edsl/questions/validation_html_report.py +404 -0
  72. edsl/questions/validation_logger.py +136 -0
  73. edsl/results/result.py +63 -16
  74. edsl/results/results.py +702 -171
  75. edsl/scenarios/construct_download_link.py +16 -3
  76. edsl/scenarios/directory_scanner.py +226 -226
  77. edsl/scenarios/file_methods.py +5 -0
  78. edsl/scenarios/file_store.py +117 -6
  79. edsl/scenarios/handlers/__init__.py +5 -1
  80. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  81. edsl/scenarios/handlers/webm_file_store.py +104 -0
  82. edsl/scenarios/scenario.py +120 -101
  83. edsl/scenarios/scenario_list.py +800 -727
  84. edsl/scenarios/scenario_list_gc_test.py +146 -0
  85. edsl/scenarios/scenario_list_memory_test.py +214 -0
  86. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  87. edsl/scenarios/scenario_selector.py +5 -4
  88. edsl/scenarios/scenario_source.py +1990 -0
  89. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  90. edsl/surveys/survey.py +22 -0
  91. edsl/tasks/__init__.py +4 -2
  92. edsl/tasks/task_history.py +198 -36
  93. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  94. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  95. edsl/utilities/__init__.py +2 -1
  96. edsl/utilities/decorators.py +121 -0
  97. edsl/utilities/memory_debugger.py +1010 -0
  98. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
  99. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
  100. edsl/jobs/jobs_runner_asyncio.py +0 -281
  101. edsl/language_models/unused/fake_openai_service.py +0 -60
  102. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
  103. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
  104. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
@@ -34,7 +34,13 @@ import os
34
34
  from io import StringIO
35
35
  import inspect
36
36
  from collections import UserList, defaultdict
37
- from collections.abc import Iterable
37
+ from collections.abc import Iterable, MutableSequence
38
+ import json
39
+ import pickle
40
+
41
+
42
+ # Import for refactoring to Source classes
43
+ from edsl.scenarios.scenario_source import deprecated_classmethod, TuplesSource
38
44
 
39
45
  from simpleeval import EvalWithCompoundTypes, NameNotDefined # type: ignore
40
46
  from tabulate import tabulate_formats
@@ -53,9 +59,17 @@ if TYPE_CHECKING:
53
59
 
54
60
 
55
61
  from ..base import Base
56
- from ..utilities import remove_edsl_version, sanitize_string, is_valid_variable_name, dict_hash
62
+ from ..utilities import (
63
+ remove_edsl_version,
64
+ sanitize_string,
65
+ is_valid_variable_name,
66
+ dict_hash,
67
+ memory_profile,
68
+ )
57
69
  from ..dataset import ScenarioListOperationsMixin
58
70
 
71
+ from ..db_list.sqlite_list import SQLiteList
72
+
59
73
  from .exceptions import ScenarioError
60
74
  from .scenario import Scenario
61
75
  from .scenario_list_pdf_tools import PdfTools
@@ -83,41 +97,39 @@ TableFormat: TypeAlias = Literal[
83
97
  "tsv",
84
98
  ]
85
99
 
86
- class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
100
+
101
+
102
+ class ScenarioSQLiteList(SQLiteList):
103
+ """SQLite-backed list specifically for storing Scenario objects."""
104
+
105
+ def serialize(self, obj):
106
+ """Serialize a Scenario object or other data to bytes using pickle."""
107
+ return pickle.dumps(obj)
108
+
109
+ def deserialize(self, data):
110
+ """Deserialize pickled bytes back to a Scenario object or other data."""
111
+ if isinstance(data, str):
112
+ return pickle.loads(data.encode())
113
+ return pickle.loads(data)
114
+
115
+
116
+ if use_sqlite := True:
117
+ data_class = ScenarioSQLiteList
118
+ else:
119
+ data_class = list
120
+
121
+ class ScenarioList(MutableSequence, Base, ScenarioListOperationsMixin):
87
122
  """
88
123
  A collection of Scenario objects with advanced operations for manipulation and analysis.
89
-
90
- ScenarioList extends Python's UserList to provide specialized functionality for
91
- working with collections of Scenario objects. It inherits from Base to integrate
92
- with EDSL's object model and from ScenarioListOperationsMixin to provide
93
- powerful data manipulation capabilities.
94
-
95
- The class provides methods for filtering, sorting, joining, transforming, and
96
- analyzing collections of Scenarios. It's designed to work seamlessly with other
97
- EDSL components like Surveys, Jobs, and Questions.
98
-
124
+
125
+ ScenarioList provides specialized functionality for working with collections of
126
+ Scenario objects. It inherits from MutableSequence to provide standard list operations,
127
+ from Base to integrate with EDSL's object model, and from ScenarioListOperationsMixin
128
+ to provide powerful data manipulation capabilities.
129
+
99
130
  Attributes:
100
- data (list): The underlying list of Scenario objects.
131
+ data (list): The underlying list containing Scenario objects.
101
132
  codebook (dict): Optional metadata describing the fields in the scenarios.
102
-
103
- Examples:
104
- Create a ScenarioList from Scenario objects:
105
- >>> from edsl.scenarios import Scenario, ScenarioList
106
- >>> s1 = Scenario({"product": "apple", "price": 1.99})
107
- >>> s2 = Scenario({"product": "banana", "price": 0.99})
108
- >>> sl = ScenarioList([s1, s2])
109
-
110
- Filter scenarios based on a condition:
111
- >>> cheap_fruits = sl.filter("price < 1.50")
112
- >>> len(cheap_fruits)
113
- 1
114
- >>> cheap_fruits[0]["product"]
115
- 'banana'
116
-
117
- Add a new column based on existing data:
118
- >>> sl_with_tax = sl.mutate("tax = price * 0.08")
119
- >>> sl_with_tax[0]["tax"]
120
- 0.1592
121
133
  """
122
134
 
123
135
  __documentation__ = (
@@ -125,43 +137,64 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
125
137
  )
126
138
 
127
139
  def __init__(
128
- self, data: Optional[list] = None, codebook: Optional[dict[str, str]] = None
140
+ self,
141
+ data: Optional[list] = None,
142
+ codebook: Optional[dict[str, str]] = None,
143
+ data_class: Optional[type] = data_class,
129
144
  ):
130
- """
131
- Initialize a new ScenarioList with optional data and codebook.
132
-
133
- Args:
134
- data: A list of Scenario objects. If None, an empty list is used.
135
- codebook: A dictionary mapping field names to descriptions or metadata.
136
- Used for documentation and to provide context for fields.
137
-
138
- Examples:
139
- >>> sl = ScenarioList() # Empty list
140
- >>> s1 = Scenario({"product": "apple"})
141
- >>> s2 = Scenario({"product": "banana"})
142
- >>> sl = ScenarioList([s1, s2]) # With data
143
-
144
- >>> # With a codebook
145
- >>> codebook = {"product": "Fruit name", "price": "Price in USD"}
146
- >>> sl = ScenarioList([s1, s2], codebook=codebook)
147
- """
148
- if data is not None:
149
- super().__init__(data)
150
- else:
151
- super().__init__([])
145
+ """Initialize a new ScenarioList with optional data and codebook."""
146
+ self._data_class = data_class
147
+ self.data = self._data_class([])
148
+ warned = False
149
+ for item in data or []:
150
+ try:
151
+ _ = json.dumps(item.to_dict())
152
+ except:
153
+ import warnings
154
+ if not warned:
155
+ warnings.warn(
156
+ f"One or more items in the data list are not JSON serializable. "
157
+ "This would prevent running a job that uses this ScenarioList."
158
+ "One solution is to use 'str(item)' to convert the item to a string before adding."
159
+ )
160
+ warned = True
161
+ self.data.append(item)
152
162
  self.codebook = codebook or {}
153
163
 
164
+ # Required MutableSequence abstract methods
165
+ def __getitem__(self, index):
166
+ """Get item at index."""
167
+ if isinstance(index, slice):
168
+ return self.__class__(list(self.data[index]), self.codebook.copy())
169
+ return self.data[index]
170
+
171
+ def __setitem__(self, index, value):
172
+ """Set item at index."""
173
+ self.data[index] = value
174
+
175
+ def __delitem__(self, index):
176
+ """Delete item at index."""
177
+ del self.data[index]
178
+
179
+ def __len__(self):
180
+ """Return number of items."""
181
+ return len(self.data)
182
+
183
+ def insert(self, index, value):
184
+ """Insert value at index."""
185
+ self.data.insert(index, value)
186
+
154
187
  def unique(self) -> ScenarioList:
155
188
  """
156
189
  Return a new ScenarioList containing only unique Scenario objects.
157
-
190
+
158
191
  This method removes duplicate Scenario objects based on their hash values,
159
192
  which are determined by their content. Two Scenarios with identical key-value
160
193
  pairs will have the same hash and be considered duplicates.
161
-
194
+
162
195
  Returns:
163
196
  A new ScenarioList containing only unique Scenario objects.
164
-
197
+
165
198
  Examples:
166
199
  >>> from edsl.scenarios import Scenario, ScenarioList
167
200
  >>> s1 = Scenario({"a": 1})
@@ -173,26 +206,36 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
173
206
  2
174
207
  >>> unique_sl
175
208
  ScenarioList([Scenario({'a': 1}), Scenario({'a': 2})])
176
-
209
+
177
210
  Notes:
178
211
  - The order of scenarios in the result is not guaranteed due to the use of sets
179
212
  - Uniqueness is determined by the Scenario's __hash__ method
180
213
  - The original ScenarioList is not modified
214
+ - This implementation is memory efficient as it processes scenarios one at a time
181
215
  """
182
- return ScenarioList(list(set(self)))
216
+ seen_hashes = set()
217
+ result = ScenarioList()
218
+
219
+ for scenario in self.data:
220
+ scenario_hash = hash(scenario)
221
+ if scenario_hash not in seen_hashes:
222
+ seen_hashes.add(scenario_hash)
223
+ result.append(scenario)
224
+
225
+ return result
183
226
 
184
227
  @property
185
228
  def has_jinja_braces(self) -> bool:
186
229
  """
187
230
  Check if any Scenario in the list contains values with Jinja template braces.
188
-
231
+
189
232
  This property checks all Scenarios in the list to determine if any contain
190
233
  string values with Jinja template syntax ({{ and }}). This is important for
191
234
  rendering templates and avoiding conflicts with other templating systems.
192
-
235
+
193
236
  Returns:
194
237
  True if any Scenario contains values with Jinja braces, False otherwise.
195
-
238
+
196
239
  Examples:
197
240
  >>> from edsl.scenarios import Scenario, ScenarioList
198
241
  >>> s1 = Scenario({"text": "Plain text"})
@@ -204,20 +247,23 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
204
247
  >>> sl2.has_jinja_braces
205
248
  True
206
249
  """
207
- return any([scenario.has_jinja_braces for scenario in self])
250
+ for scenario in self:
251
+ if scenario.has_jinja_braces:
252
+ return True
253
+ return False
208
254
 
209
255
  def _convert_jinja_braces(self) -> ScenarioList:
210
256
  """
211
257
  Convert Jinja braces to alternative symbols in all Scenarios in the list.
212
-
258
+
213
259
  This method creates a new ScenarioList where all Jinja template braces
214
260
  ({{ and }}) in string values are converted to alternative symbols (<< and >>).
215
261
  This is useful when you need to prevent template processing or avoid conflicts
216
262
  with other templating systems.
217
-
263
+
218
264
  Returns:
219
265
  A new ScenarioList with converted braces in all Scenarios.
220
-
266
+
221
267
  Examples:
222
268
  >>> from edsl.scenarios import Scenario, ScenarioList
223
269
  >>> s = Scenario({"text": "Template with {{variable}}"})
@@ -225,13 +271,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
225
271
  >>> converted = sl._convert_jinja_braces()
226
272
  >>> converted[0]["text"]
227
273
  'Template with <<variable>>'
228
-
274
+
229
275
  Notes:
230
276
  - The original ScenarioList is not modified
231
277
  - This is primarily intended for internal use
232
278
  - The default replacement symbols are << and >>
233
279
  """
234
- return ScenarioList([scenario._convert_jinja_braces() for scenario in self])
280
+ converted_sl = ScenarioList()
281
+ for scenario in self:
282
+ converted_sl.append(scenario._convert_jinja_braces())
283
+ return converted_sl
235
284
 
236
285
  def give_valid_names(self, existing_codebook: dict = None) -> ScenarioList:
237
286
  """Give valid names to the scenario keys, using an existing codebook if provided.
@@ -253,7 +302,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
253
302
  ScenarioList([Scenario({'custom_name': 1, 'b': 2}), Scenario({'a': 1, 'b': 1})])
254
303
  """
255
304
  codebook = existing_codebook.copy() if existing_codebook else {}
256
- new_scenarios = []
305
+
306
+ new_scenarios = ScenarioList(data = [], codebook = codebook)
257
307
 
258
308
  for scenario in self:
259
309
  new_scenario = {}
@@ -274,7 +324,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
274
324
 
275
325
  new_scenarios.append(Scenario(new_scenario))
276
326
 
277
- return ScenarioList(new_scenarios, codebook)
327
+ return new_scenarios
278
328
 
279
329
  def unpivot(
280
330
  self,
@@ -301,7 +351,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
301
351
  if value_vars is None:
302
352
  value_vars = [field for field in self[0].keys() if field not in id_vars]
303
353
 
304
- new_scenarios = []
354
+ new_scenarios = ScenarioList(data = [], codebook = {})
305
355
  for scenario in self:
306
356
  for var in value_vars:
307
357
  new_scenario = {id_var: scenario[id_var] for id_var in id_vars}
@@ -309,35 +359,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
309
359
  new_scenario["value"] = scenario[var]
310
360
  new_scenarios.append(Scenario(new_scenario))
311
361
 
312
- return ScenarioList(new_scenarios)
313
-
314
- def sem_filter(self, language_predicate: str) -> ScenarioList:
315
- """Filter the ScenarioList based on a language predicate.
316
-
317
- :param language_predicate: The language predicate to use.
318
-
319
- Inspired by:
320
- @misc{patel2024semanticoperators,
321
- title={Semantic Operators: A Declarative Model for Rich, AI-based Analytics Over Text Data},
322
- author={Liana Patel and Siddharth Jha and Parth Asawa and Melissa Pan and Carlos Guestrin and Matei Zaharia},
323
- year={2024},
324
- eprint={2407.11418},
325
- archivePrefix={arXiv},
326
- primaryClass={cs.DB},
327
- url={https://arxiv.org/abs/2407.11418},
328
- }
329
- """
330
- from ..questions import QuestionYesNo
331
-
332
- new_scenario_list = self.duplicate()
333
- q = QuestionYesNo(
334
- question_text=language_predicate, question_name="binary_outcome"
335
- )
336
- results = q.by(new_scenario_list).run(verbose=False)
337
- new_scenario_list = new_scenario_list.add_list(
338
- "criteria", results.select("binary_outcome").to_list()
339
- )
340
- return new_scenario_list.filter("criteria == 'Yes'").drop("criteria")
362
+ return new_scenarios
341
363
 
342
364
  def pivot(
343
365
  self,
@@ -378,14 +400,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
378
400
  value = scenario[value_name]
379
401
  pivoted_dict[id_key][variable] = value
380
402
 
381
- # Convert the dict of dicts to a list of Scenarios
382
- pivoted_scenarios = [
383
- Scenario(dict(zip(id_vars, id_key), **values))
384
- for id_key, values in pivoted_dict.items()
385
- ]
386
-
387
- return ScenarioList(pivoted_scenarios)
388
-
403
+ new_sl = ScenarioList(data = [], codebook = self.codebook)
404
+ for id_key, values in pivoted_dict.items():
405
+ new_sl.append(Scenario(dict(zip(id_vars, id_key), **values)))
406
+ return new_sl
407
+
389
408
  def group_by(
390
409
  self, id_vars: List[str], variables: List[str], func: Callable
391
410
  ) -> ScenarioList:
@@ -426,7 +445,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
426
445
  grouped[key][var].append(scenario[var])
427
446
 
428
447
  # Apply the function to each group
429
- result = []
448
+ new_sl= ScenarioList(data = [], codebook = self.codebook)
430
449
  for key, group in grouped.items():
431
450
  try:
432
451
  aggregated = func(*[group[var] for var in variables])
@@ -440,9 +459,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
440
459
 
441
460
  new_scenario = dict(zip(id_vars, key))
442
461
  new_scenario.update(aggregated)
443
- result.append(Scenario(new_scenario))
462
+ new_sl.append(Scenario(new_scenario))
444
463
 
445
- return ScenarioList(result)
464
+ return new_sl
446
465
 
447
466
  @property
448
467
  def parameters(self) -> set:
@@ -457,22 +476,51 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
457
476
  if len(self) == 0:
458
477
  return set()
459
478
 
460
- return set.union(*[set(s.keys()) for s in self])
479
+ params = set()
480
+ for scenario in self:
481
+ params.update(scenario.keys())
482
+ return params
461
483
 
462
- def __hash__(self) -> int:
463
- """Return the hash of the ScenarioList.
484
+ def __original_hash__(self) -> int:
485
+ """Return the original hash of the ScenarioList using the dictionary-based approach.
464
486
 
465
487
  >>> s = ScenarioList.example()
466
- >>> hash(s)
488
+ >>> s.__original_hash__()
467
489
  1262252885757976162
468
490
  """
469
491
  return dict_hash(self.to_dict(sort=True, add_edsl_version=False))
470
492
 
493
+ def __hash__(self) -> int:
494
+ """Return the hash of the ScenarioList using a memory-efficient streaming approach.
495
+
496
+ >>> s = ScenarioList.example()
497
+ >>> hash(s)
498
+ 1219708685929871252
499
+ """
500
+ # Start with a seed value
501
+ running_hash = 0
502
+
503
+ # Use a heap to maintain sorted order as we go
504
+ import heapq
505
+ heap = []
506
+
507
+ # Process each scenario's hash and add to heap
508
+ for scenario in self:
509
+ heapq.heappush(heap, hash(scenario))
510
+
511
+ # Combine hashes in sorted order
512
+ while heap:
513
+ h = heapq.heappop(heap)
514
+ # Use a large prime number to mix the bits
515
+ running_hash = (running_hash * 31) ^ h
516
+
517
+ return running_hash
518
+
471
519
  def __eq__(self, other: Any) -> bool:
472
520
  return hash(self) == hash(other)
473
521
 
474
522
  def __repr__(self):
475
- return f"ScenarioList({self.data})"
523
+ return f"ScenarioList({list(self.data)})"
476
524
 
477
525
  def __mul__(self, other: ScenarioList) -> ScenarioList:
478
526
  """Takes the cross product of two ScenarioLists.
@@ -484,16 +532,18 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
484
532
  """
485
533
  from itertools import product
486
534
  from .scenario import Scenario
535
+
487
536
  if isinstance(other, Scenario):
488
537
  other = ScenarioList([other])
489
538
  elif not isinstance(other, ScenarioList):
490
539
  from .exceptions import TypeScenarioError
540
+
491
541
  raise TypeScenarioError(f"Cannot multiply ScenarioList with {type(other)}")
492
542
 
493
- new_sl = []
494
- for s1, s2 in list(product(self, other)):
543
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
544
+ for s1, s2 in product(self, other):
495
545
  new_sl.append(s1 + s2)
496
- return ScenarioList(new_sl)
546
+ return new_sl
497
547
 
498
548
  def times(self, other: ScenarioList) -> ScenarioList:
499
549
  """Takes the cross product of two ScenarioLists.
@@ -505,6 +555,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
505
555
  >>> s1.times(s2)
506
556
  ScenarioList([Scenario({'a': 1, 'b': 1}), Scenario({'a': 1, 'b': 2}), Scenario({'a': 2, 'b': 1}), Scenario({'a': 2, 'b': 2})])
507
557
  """
558
+ import warnings
559
+ warnings.warn("times is deprecated, use * instead", DeprecationWarning)
508
560
  return self.__mul__(other)
509
561
 
510
562
  def shuffle(self, seed: Optional[str] = None) -> ScenarioList:
@@ -524,14 +576,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
524
576
  """Return a random sample from the ScenarioList
525
577
 
526
578
  >>> s = ScenarioList.from_list("a", [1,2,3,4,5,6])
527
- >>> s.sample(3, seed = "edsl")
579
+ >>> s.sample(3, seed = "edsl") # doctest: +SKIP
528
580
  ScenarioList([Scenario({'a': 2}), Scenario({'a': 1}), Scenario({'a': 3})])
529
581
  """
530
582
  if seed:
531
583
  random.seed(seed)
532
584
 
533
585
  sl = self.duplicate()
534
- return ScenarioList(random.sample(sl.data, n))
586
+ # Convert to list if necessary for random.sample
587
+ data_list = list(sl.data)
588
+ return ScenarioList(random.sample(data_list, n))
535
589
 
536
590
  def expand(self, expand_field: str, number_field: bool = False) -> ScenarioList:
537
591
  """Expand the ScenarioList by a field.
@@ -560,15 +614,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
560
614
  new_scenarios.append(new_scenario)
561
615
  return ScenarioList(new_scenarios)
562
616
 
563
- def _concatenate(self, fields: List[str], output_type: str = "string", separator: str = ";", new_field_name: Optional[str] = None) -> ScenarioList:
617
+ def _concatenate(
618
+ self,
619
+ fields: List[str],
620
+ output_type: str = "string",
621
+ separator: str = ";",
622
+ new_field_name: Optional[str] = None,
623
+ ) -> ScenarioList:
564
624
  """Private method to handle concatenation logic for different output types.
565
-
625
+
566
626
  :param fields: The fields to concatenate.
567
627
  :param output_type: The type of output ("string", "list", or "set").
568
628
  :param separator: The separator to use for string concatenation.
569
629
  :param new_field_name: Optional custom name for the concatenated field.
570
630
  If None, defaults to "concat_field1_field2_..."
571
-
631
+
572
632
  Returns:
573
633
  ScenarioList: A new ScenarioList with concatenated fields.
574
634
  """
@@ -577,7 +637,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
577
637
  raise ScenarioError(
578
638
  f"The 'fields' parameter must be a list of field names, not a string. Got '{fields}'."
579
639
  )
580
-
640
+
581
641
  new_scenarios = []
582
642
  for scenario in self:
583
643
  new_scenario = scenario.copy()
@@ -587,8 +647,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
587
647
  values.append(new_scenario[field])
588
648
  del new_scenario[field]
589
649
 
590
- field_name = new_field_name if new_field_name is not None else f"concat_{'_'.join(fields)}"
591
-
650
+ field_name = (
651
+ new_field_name
652
+ if new_field_name is not None
653
+ else f"concat_{'_'.join(fields)}"
654
+ )
655
+
592
656
  if output_type == "string":
593
657
  # Convert all values to strings and join with separator
594
658
  new_scenario[field_name] = separator.join(str(v) for v in values)
@@ -600,13 +664,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
600
664
  new_scenario[field_name] = set(values)
601
665
  else:
602
666
  from .exceptions import ValueScenarioError
603
- raise ValueScenarioError(f"Invalid output_type: {output_type}. Must be 'string', 'list', or 'set'.")
604
-
667
+
668
+ raise ValueScenarioError(
669
+ f"Invalid output_type: {output_type}. Must be 'string', 'list', or 'set'."
670
+ )
671
+
605
672
  new_scenarios.append(new_scenario)
606
673
 
607
674
  return ScenarioList(new_scenarios)
608
675
 
609
- def concatenate(self, fields: List[str], separator: str = ";", new_field_name: Optional[str] = None) -> ScenarioList:
676
+ def concatenate(
677
+ self,
678
+ fields: List[str],
679
+ separator: str = ";",
680
+ new_field_name: Optional[str] = None,
681
+ ) -> ScenarioList:
610
682
  """Concatenate specified fields into a single string field.
611
683
 
612
684
  :param fields: The fields to concatenate.
@@ -623,9 +695,16 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
623
695
  >>> s.concatenate(['a', 'b', 'c'], new_field_name='combined')
624
696
  ScenarioList([Scenario({'combined': '1;2;3'}), Scenario({'combined': '4;5;6'})])
625
697
  """
626
- return self._concatenate(fields, output_type="string", separator=separator, new_field_name=new_field_name)
698
+ return self._concatenate(
699
+ fields,
700
+ output_type="string",
701
+ separator=separator,
702
+ new_field_name=new_field_name,
703
+ )
627
704
 
628
- def concatenate_to_list(self, fields: List[str], new_field_name: Optional[str] = None) -> ScenarioList:
705
+ def concatenate_to_list(
706
+ self, fields: List[str], new_field_name: Optional[str] = None
707
+ ) -> ScenarioList:
629
708
  """Concatenate specified fields into a single list field.
630
709
 
631
710
  :param fields: The fields to concatenate.
@@ -641,9 +720,13 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
641
720
  >>> s.concatenate_to_list(['a', 'b', 'c'], new_field_name='values')
642
721
  ScenarioList([Scenario({'values': [1, 2, 3]}), Scenario({'values': [4, 5, 6]})])
643
722
  """
644
- return self._concatenate(fields, output_type="list", new_field_name=new_field_name)
723
+ return self._concatenate(
724
+ fields, output_type="list", new_field_name=new_field_name
725
+ )
645
726
 
646
- def concatenate_to_set(self, fields: List[str], new_field_name: Optional[str] = None) -> ScenarioList:
727
+ def concatenate_to_set(
728
+ self, fields: List[str], new_field_name: Optional[str] = None
729
+ ) -> ScenarioList:
647
730
  """Concatenate specified fields into a single set field.
648
731
 
649
732
  :param fields: The fields to concatenate.
@@ -659,7 +742,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
659
742
  >>> s.concatenate_to_set(['a', 'b', 'c'], new_field_name='unique_values')
660
743
  ScenarioList([Scenario({'unique_values': {1, 2, 3}}), Scenario({'unique_values': {4, 5, 6}})])
661
744
  """
662
- return self._concatenate(fields, output_type="set", new_field_name=new_field_name)
745
+ return self._concatenate(
746
+ fields, output_type="set", new_field_name=new_field_name
747
+ )
663
748
 
664
749
  def unpack_dict(
665
750
  self, field: str, prefix: Optional[str] = None, drop_field: bool = False
@@ -773,10 +858,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
773
858
  def get_sort_key(scenario: Any) -> tuple:
774
859
  return tuple(scenario[field] for field in fields)
775
860
 
776
- return ScenarioList(sorted(self, key=get_sort_key, reverse=reverse))
861
+ return ScenarioList(sorted(self.data, key=get_sort_key, reverse=reverse))
777
862
 
778
863
  def duplicate(self) -> ScenarioList:
779
- """Return a copy of the ScenarioList.
864
+ """Return a copy of the ScenarioList using streaming to avoid loading everything into memory.
780
865
 
781
866
  >>> sl = ScenarioList.example()
782
867
  >>> sl_copy = sl.duplicate()
@@ -785,8 +870,30 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
785
870
  >>> sl is sl_copy
786
871
  False
787
872
  """
788
- return ScenarioList([scenario.copy() for scenario in self])
873
+ new_list = ScenarioList()
874
+ for scenario in self.data:
875
+ new_list.append(scenario.copy())
876
+ return new_list
789
877
 
878
+ def __iter__(self):
879
+ """Iterate over scenarios using streaming."""
880
+ return iter(self.data)
881
+
882
+ def equals(self, other: Any) -> bool:
883
+ """Memory-efficient comparison of two ScenarioLists."""
884
+ if not isinstance(other, ScenarioList):
885
+ return False
886
+ if len(self) != len(other):
887
+ return False
888
+ if self.codebook != other.codebook:
889
+ return False
890
+ return self.data == other.data
891
+
892
+ def __eq__(self, other: Any) -> bool:
893
+ """Use memory-efficient comparison by default."""
894
+ return self.equals(other)
895
+
896
+ @memory_profile
790
897
  def filter(self, expression: str) -> ScenarioList:
791
898
  """
792
899
  Filter a list of scenarios based on an expression.
@@ -799,36 +906,62 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
799
906
  >>> s.filter("b == 2")
800
907
  ScenarioList([Scenario({'a': 1, 'b': 2})])
801
908
  """
802
- sl = self.duplicate()
803
- base_keys = set(self[0].keys())
804
- keys = set()
805
- for scenario in sl:
806
- keys.update(scenario.keys())
807
- if keys != base_keys:
808
- import warnings
809
-
810
- warnings.warn(
811
- "Ragged ScenarioList detected (different keys for different scenario entries). This may cause unexpected behavior."
812
- )
909
+ # Get first item to check keys if available
910
+ try:
911
+ first_item = self[0] if len(self) > 0 else None
912
+ if first_item:
913
+ # Check for ragged keys by examining a sample of scenarios
914
+ # rather than iterating through all of them
915
+ sample_size = min(len(self), 100) # Check at most 100 scenarios
916
+ base_keys = set(first_item.keys())
917
+ keys = set()
918
+
919
+ # Use a counter to check only the sample_size
920
+ count = 0
921
+ for scenario in self:
922
+ keys.update(scenario.keys())
923
+ count += 1
924
+ if count >= sample_size:
925
+ break
926
+
927
+ if keys != base_keys:
928
+ import warnings
929
+ warnings.warn(
930
+ "Ragged ScenarioList detected (different keys for different scenario entries). This may cause unexpected behavior."
931
+ )
932
+ except IndexError:
933
+ pass
934
+
935
+ # Create new ScenarioList with filtered data
936
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
813
937
 
814
938
  def create_evaluator(scenario: Scenario):
815
- """Create an evaluator for the given result.
816
- The 'combined_dict' is a mapping of all values for that Result object.
817
- """
939
+ """Create an evaluator for the given scenario."""
818
940
  return EvalWithCompoundTypes(names=scenario)
819
941
 
820
942
  try:
821
- # iterates through all the results and evaluates the expression
822
- new_data = []
823
- for scenario in sl:
943
+ # Process one scenario at a time to minimize memory usage
944
+ for scenario in self:
945
+ # Check if scenario matches the filter expression
824
946
  if create_evaluator(scenario).eval(expression):
825
- new_data.append(scenario)
947
+ # Create a copy and immediately append to the new list
948
+ scenario_copy = scenario.copy()
949
+ new_sl.append(scenario_copy)
950
+
951
+ # Remove reference to allow for garbage collection
952
+ del scenario_copy
953
+
826
954
  except NameNotDefined as e:
827
- available_fields = ", ".join(self.data[0].keys() if self.data else [])
955
+ # Get available fields for error message
956
+ try:
957
+ first_item = self[0] if len(self) > 0 else None
958
+ available_fields = ", ".join(first_item.keys() if first_item else [])
959
+ except:
960
+ available_fields = "unknown"
961
+
828
962
  raise ScenarioError(
829
963
  f"Error in filter: '{e}'\n"
830
964
  f"The expression '{expression}' refers to a field that does not exist.\n"
831
- f"Scenario: {scenario}\n"
832
965
  f"Available fields: {available_fields}\n"
833
966
  "Check your filter expression or consult the documentation: "
834
967
  "https://docs.expectedparrot.com/en/latest/scenarios.html#module-edsl.scenarios.Scenario"
@@ -836,18 +969,24 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
836
969
  except Exception as e:
837
970
  raise ScenarioError(f"Error in filter. Exception:{e}")
838
971
 
839
- return ScenarioList(new_data)
972
+ return new_sl
840
973
 
841
- def from_urls(
842
- self, urls: list[str], field_name: Optional[str] = "text"
843
- ) -> ScenarioList:
844
- """Create a ScenarioList from a list of URLs.
845
-
846
- :param urls: A list of URLs.
847
- :param field_name: The name of the field to store the text from the URLs.
848
974
 
975
+ @classmethod
976
+ def from_urls(cls, urls: list[str], field_name: Optional[str] = "text") -> ScenarioList:
977
+ from .scenario_source import URLSource
978
+ return URLSource(urls, field_name).to_scenario_list()
979
+
980
+ @classmethod
981
+ def from_list(cls, field_name: str, values: list, use_indexes: bool = False) -> ScenarioList:
982
+ """Create a ScenarioList from a list of values with a specified field name.
983
+
984
+ >>> ScenarioList.from_list('text', ['a', 'b', 'c'])
985
+ ScenarioList([Scenario({'text': 'a'}), Scenario({'text': 'b'}), Scenario({'text': 'c'})])
849
986
  """
850
- return ScenarioList([Scenario.from_url(url, field_name) for url in urls])
987
+ from .scenario_source import ListSource
988
+ return ListSource(field_name, values, use_indexes).to_scenario_list()
989
+
851
990
 
852
991
  def select(self, *fields: str) -> ScenarioList:
853
992
  """
@@ -874,8 +1013,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
874
1013
  >>> s.drop('a')
875
1014
  ScenarioList([Scenario({'b': 1}), Scenario({'b': 2})])
876
1015
  """
877
- sl = self.duplicate()
878
- return ScenarioList([scenario.drop(fields) for scenario in sl])
1016
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1017
+ for scenario in self:
1018
+ new_sl.append(scenario.drop(fields))
1019
+ return new_sl
879
1020
 
880
1021
  def keep(self, *fields: str) -> ScenarioList:
881
1022
  """Keep only the specified fields in the scenarios.
@@ -888,8 +1029,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
888
1029
  >>> s.keep('a')
889
1030
  ScenarioList([Scenario({'a': 1}), Scenario({'a': 1})])
890
1031
  """
891
- sl = self.duplicate()
892
- return ScenarioList([scenario.keep(fields) for scenario in sl])
1032
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1033
+ for scenario in self:
1034
+ new_sl.append(scenario.keep(fields))
1035
+ return new_sl
893
1036
 
894
1037
  @classmethod
895
1038
  def from_directory(
@@ -899,12 +1042,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
899
1042
  key_name: str = "content",
900
1043
  ) -> "ScenarioList":
901
1044
  """Create a ScenarioList of Scenario objects from files in a directory.
902
-
1045
+
903
1046
  This method scans a directory and creates a Scenario object for each file found,
904
1047
  where each Scenario contains a FileStore object under the specified key.
905
1048
  Optionally filters files based on a wildcard pattern. If no path is provided,
906
1049
  the current working directory is used.
907
-
1050
+
908
1051
  Args:
909
1052
  path: The directory path to scan, optionally including a wildcard pattern.
910
1053
  If None, uses the current working directory.
@@ -914,124 +1057,84 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
914
1057
  - "*.txt" - scans only text files in the current working directory
915
1058
  recursive: Whether to scan subdirectories recursively. Defaults to False.
916
1059
  key_name: The key to use for the FileStore object in each Scenario. Defaults to "content".
917
-
1060
+
918
1061
  Returns:
919
1062
  A ScenarioList containing Scenario objects for all matching files, where each Scenario
920
1063
  has a FileStore object under the specified key.
921
-
1064
+
922
1065
  Raises:
923
1066
  FileNotFoundError: If the specified directory does not exist.
924
-
1067
+
925
1068
  Examples:
926
1069
  # Get all files in the current directory with default key "content"
927
1070
  sl = ScenarioList.from_directory()
928
-
1071
+
929
1072
  # Get all Python files in a specific directory with custom key "python_file"
930
1073
  sl = ScenarioList.from_directory('*.py', key_name="python_file")
931
-
1074
+
932
1075
  # Get all image files in the current directory
933
1076
  sl = ScenarioList.from_directory('*.png', key_name="image")
934
-
1077
+
935
1078
  # Get all files recursively including subdirectories
936
1079
  sl = ScenarioList.from_directory(recursive=True, key_name="document")
937
1080
  """
938
- # Handle default case - use current directory
939
- if path is None:
940
- directory_path = os.getcwd()
941
- pattern = None
942
- else:
943
- # Special handling for "**" pattern which indicates recursive scanning
944
- has_recursive_pattern = '**' in path if path else False
945
-
946
- # Check if path contains any wildcard
947
- if path and ('*' in path):
948
- # Handle "**/*.ext" pattern - find the directory part before the **
949
- if has_recursive_pattern:
950
- # Extract the base directory by finding the part before **
951
- parts = path.split('**')
952
- if parts and parts[0]:
953
- # Remove trailing slash if any
954
- directory_path = parts[0].rstrip('/')
955
- if not directory_path:
956
- directory_path = os.getcwd()
957
- # Get the pattern after **
958
- pattern = parts[1] if len(parts) > 1 else None
959
- if pattern and pattern.startswith('/'):
960
- pattern = pattern[1:] # Remove leading slash
961
- else:
962
- directory_path = os.getcwd()
963
- pattern = None
964
- # Handle case where path is just a pattern (e.g., "*.py")
965
- elif os.path.dirname(path) == '':
966
- directory_path = os.getcwd()
967
- pattern = os.path.basename(path)
968
- else:
969
- # Split into directory and pattern
970
- directory_path = os.path.dirname(path)
971
- if not directory_path:
972
- directory_path = os.getcwd()
973
- pattern = os.path.basename(path)
974
- else:
975
- # Path is a directory with no pattern
976
- directory_path = path
977
- pattern = None
978
-
979
- # Ensure directory exists
980
- if not os.path.isdir(directory_path):
981
- from .exceptions import FileNotFoundScenarioError
982
- raise FileNotFoundScenarioError(f"Directory not found: {directory_path}")
983
-
984
- # Create a DirectoryScanner for the directory
985
- scanner = DirectoryScanner(directory_path)
986
-
987
- # Configure wildcard pattern filtering
988
- suffix_allow_list = None
989
- example_suffix = None
990
-
991
- if pattern:
992
- if pattern.startswith('*.'):
993
- # Simple extension filter (e.g., "*.py")
994
- suffix_allow_list = [pattern[2:]]
995
- elif '*' in pattern:
996
- # Other wildcard patterns
997
- example_suffix = pattern
998
- else:
999
- # Handle simple non-wildcard pattern (exact match)
1000
- example_suffix = pattern
1081
+ import warnings
1082
+ warnings.warn(
1083
+ "from_directory is deprecated. Use ScenarioSource.from_source('directory', ...) instead.",
1084
+ DeprecationWarning,
1085
+ stacklevel=2
1086
+ )
1087
+ from .scenario_source import DirectorySource
1001
1088
 
1002
- # Use scanner to find files and create FileStore objects
1003
- file_stores = scanner.scan(
1004
- factory=lambda path: FileStore(path),
1089
+ source = DirectorySource(
1090
+ directory=path or os.getcwd(),
1091
+ pattern="*",
1005
1092
  recursive=recursive,
1006
- suffix_allow_list=suffix_allow_list,
1007
- example_suffix=example_suffix
1093
+ metadata=True
1008
1094
  )
1009
1095
 
1010
- # Convert FileStore objects to Scenario objects with the specified key
1011
- scenarios = [Scenario({key_name: file_store}) for file_store in file_stores]
1096
+ # Get the ScenarioList with FileStore objects under "file" key
1097
+ sl = source.to_scenario_list()
1012
1098
 
1013
- return cls(scenarios)
1014
-
1015
- @classmethod
1016
- def from_list(
1017
- cls, name: str, values: list, func: Optional[Callable] = None
1018
- ) -> ScenarioList:
1019
- """Create a ScenarioList from a list of values.
1099
+ # If the requested key is different from the default "file" key used by DirectoryScanner.scan_directory,
1100
+ # rename the keys in all scenarios
1101
+ if key_name != "file":
1102
+ # Create a new ScenarioList
1103
+ result = ScenarioList([])
1104
+ for scenario in sl:
1105
+ # Create a new scenario with the file under the specified key
1106
+ new_data = {key_name: scenario["file"]}
1107
+ # Add all other fields from the original scenario
1108
+ for k, v in scenario.items():
1109
+ if k != "file":
1110
+ new_data[k] = v
1111
+ result.append(Scenario(new_data))
1112
+ return result
1113
+
1114
+ return sl
1020
1115
 
1021
- :param name: The name of the field.
1022
- :param values: The list of values.
1023
- :param func: An optional function to apply to the values.
1116
+ # @classmethod
1117
+ # def from_list(
1118
+ # cls, name: str, values: list, func: Optional[Callable] = None
1119
+ # ) -> ScenarioList:
1120
+ # """Create a ScenarioList from a list of values.
1024
1121
 
1025
- Example:
1122
+ # :param name: The name of the field.
1123
+ # :param values: The list of values.
1124
+ # :param func: An optional function to apply to the values.
1026
1125
 
1027
- >>> ScenarioList.from_list('name', ['Alice', 'Bob'])
1028
- ScenarioList([Scenario({'name': 'Alice'}), Scenario({'name': 'Bob'})])
1029
- """
1030
- if not func:
1031
- def identity(x):
1032
- return x
1033
- func = identity
1034
- return cls([Scenario({name: func(value)}) for value in values])
1126
+ # Example:
1127
+
1128
+ # >>> ScenarioList.from_list('name', ['Alice', 'Bob'])
1129
+ # ScenarioList([Scenario({'name': 'Alice'}), Scenario({'name': 'Bob'})])
1130
+ # """
1131
+ # if not func:
1132
+
1133
+ # def identity(x):
1134
+ # return x
1135
+
1136
+ # func = identity
1137
+ # return cls([Scenario({name: func(value)}) for value in values])
1035
1138
 
1036
1139
  def table(
1037
1140
  self,
@@ -1041,7 +1144,6 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1041
1144
  ) -> str:
1042
1145
  """Return the ScenarioList as a table."""
1043
1146
 
1044
-
1045
1147
  if tablefmt is not None and tablefmt not in tabulate_formats:
1046
1148
  raise ValueError(
1047
1149
  f"Invalid table format: {tablefmt}",
@@ -1084,11 +1186,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1084
1186
  """
1085
1187
  assert set(new_order) == set(self.parameters)
1086
1188
 
1087
- new_scenarios = []
1189
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1088
1190
  for scenario in self:
1089
1191
  new_scenario = Scenario({key: scenario[key] for key in new_order})
1090
- new_scenarios.append(new_scenario)
1091
- return ScenarioList(new_scenarios)
1192
+ new_sl.append(new_scenario)
1193
+ return new_sl
1092
1194
 
1093
1195
  def to_dataset(self) -> "Dataset":
1094
1196
  """
@@ -1128,7 +1230,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1128
1230
 
1129
1231
  """
1130
1232
  new_names = new_names or [f"{field}_{i}" for i in range(len(self[0][field]))]
1131
- new_scenarios = []
1233
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1132
1234
  for scenario in self:
1133
1235
  new_scenario = scenario.copy()
1134
1236
  if len(new_names) == 1:
@@ -1139,15 +1241,25 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1139
1241
 
1140
1242
  if not keep_original:
1141
1243
  del new_scenario[field]
1142
- new_scenarios.append(new_scenario)
1143
- return ScenarioList(new_scenarios)
1244
+ new_sl.append(new_scenario)
1245
+ return new_sl
1144
1246
 
1145
1247
  @classmethod
1146
- def from_list_of_tuples(self, *names: str, values: List[tuple]) -> ScenarioList:
1147
- sl = ScenarioList.from_list(names[0], [value[0] for value in values])
1148
- for index, name in enumerate(names[1:]):
1149
- sl = sl.add_list(name, [value[index + 1] for value in values])
1150
- return sl
1248
+ @deprecated_classmethod("ScenarioSource.from_source('list_of_tuples', ...)")
1249
+ def from_list_of_tuples(cls, field_names: list[str], values: list[tuple], use_indexes: bool = False) -> ScenarioList:
1250
+ """Create a ScenarioList from a list of tuples with specified field names.
1251
+
1252
+ Args:
1253
+ field_names: A list of field names for the tuples
1254
+ values: A list of tuples with values matching the field_names
1255
+ use_indexes: Whether to add an index field to each scenario
1256
+
1257
+ Returns:
1258
+ A ScenarioList containing the data from the tuples
1259
+ """
1260
+ from .scenario_source import TuplesSource
1261
+ source = TuplesSource(field_names, values, use_indexes)
1262
+ return source.to_scenario_list()
1151
1263
 
1152
1264
  def add_list(self, name: str, values: List[Any]) -> ScenarioList:
1153
1265
  """Add a list of values to a ScenarioList.
@@ -1158,19 +1270,25 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1158
1270
  >>> s.add_list('age', [30, 25])
1159
1271
  ScenarioList([Scenario({'name': 'Alice', 'age': 30}), Scenario({'name': 'Bob', 'age': 25})])
1160
1272
  """
1161
- sl = self.duplicate()
1162
- if len(values) != len(sl):
1273
+ #sl = self.duplicate()
1274
+ if len(values) != len(self.data):
1163
1275
  raise ScenarioError(
1164
- f"Length of values ({len(values)}) does not match length of ScenarioList ({len(sl)})"
1276
+ f"Length of values ({len(values)}) does not match length of ScenarioList ({len(self)})"
1165
1277
  )
1278
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1166
1279
  for i, value in enumerate(values):
1167
- sl[i][name] = value
1168
- return sl
1280
+ scenario = self.data[i]
1281
+ scenario[name] = value
1282
+ new_sl.append(scenario)
1283
+ return new_sl
1169
1284
 
1170
1285
  @classmethod
1171
1286
  def create_empty_scenario_list(cls, n: int) -> ScenarioList:
1172
1287
  """Create an empty ScenarioList with n scenarios.
1173
1288
 
1289
+ Args:
1290
+ n: The number of empty scenarios to create
1291
+
1174
1292
  Example:
1175
1293
 
1176
1294
  >>> ScenarioList.create_empty_scenario_list(3)
@@ -1187,11 +1305,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1187
1305
  >>> s.add_value('age', 30)
1188
1306
  ScenarioList([Scenario({'name': 'Alice', 'age': 30}), Scenario({'name': 'Bob', 'age': 30})])
1189
1307
  """
1190
- sl = self.duplicate()
1191
- for scenario in sl:
1308
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
1309
+ for scenario in self:
1192
1310
  scenario[name] = value
1193
- return sl
1194
-
1311
+ new_sl.append(scenario)
1312
+ return new_sl
1313
+
1195
1314
  def rename(self, replacement_dict: dict) -> ScenarioList:
1196
1315
  """Rename the fields in the scenarios.
1197
1316
 
@@ -1204,13 +1323,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1204
1323
  ScenarioList([Scenario({'first_name': 'Alice', 'years': 30}), Scenario({'first_name': 'Bob', 'years': 25})])
1205
1324
 
1206
1325
  """
1207
- new_list = ScenarioList([])
1208
- for obj in self:
1209
- new_obj = obj.rename(replacement_dict)
1210
- new_list.append(new_obj)
1211
- return new_list
1212
-
1213
-
1326
+ new_sl = ScenarioList(data = [], codebook=self.codebook)
1327
+ for scenario in self:
1328
+ new_scenario = scenario.rename(replacement_dict)
1329
+ new_sl.append(new_scenario)
1330
+ return new_sl
1214
1331
 
1215
1332
  def replace_names(self, new_names: list) -> ScenarioList:
1216
1333
  """Replace the field names in the scenarios with a new list of names.
@@ -1225,7 +1342,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1225
1342
  """
1226
1343
  if not self:
1227
1344
  return ScenarioList([])
1228
-
1345
+
1229
1346
  if len(new_names) != len(self[0].keys()):
1230
1347
  raise ScenarioError(
1231
1348
  f"Length of new names ({len(new_names)}) does not match number of fields ({len(self[0].keys())})"
@@ -1253,72 +1370,71 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1253
1370
  # return new_list
1254
1371
 
1255
1372
  @classmethod
1256
- def from_sqlite(cls, filepath: str, table: Optional[str] = None, sql_query: Optional[str] = None):
1373
+ @deprecated_classmethod("ScenarioSource.from_source('sqlite', ...)")
1374
+ def from_sqlite(
1375
+ cls, filepath: str, table: Optional[str] = None, sql_query: Optional[str] = None
1376
+ ):
1257
1377
  """Create a ScenarioList from a SQLite database.
1258
-
1378
+
1259
1379
  Args:
1260
1380
  filepath (str): Path to the SQLite database file
1261
1381
  table (Optional[str]): Name of table to query. If None, sql_query must be provided.
1262
1382
  sql_query (Optional[str]): SQL query to execute. Used if table is None.
1263
-
1383
+
1264
1384
  Returns:
1265
1385
  ScenarioList: List of scenarios created from database rows
1266
-
1386
+
1267
1387
  Raises:
1268
1388
  ValueError: If both table and sql_query are None
1269
1389
  sqlite3.Error: If there is an error executing the database query
1270
1390
  """
1271
- import sqlite3
1272
-
1391
+ from .scenario_source import SQLiteSource
1392
+
1393
+ # Handle the case where sql_query is provided instead of table
1273
1394
  if table is None and sql_query is None:
1274
1395
  from .exceptions import ValueScenarioError
1275
1396
  raise ValueScenarioError("Either table or sql_query must be provided")
1397
+
1398
+ if table is None:
1399
+ # We need to use the old implementation for SQL queries
1400
+ import sqlite3
1276
1401
 
1277
- try:
1278
- with sqlite3.connect(filepath) as conn:
1279
- cursor = conn.cursor()
1280
-
1281
- if table is not None:
1282
- cursor.execute(f"SELECT * FROM {table}")
1283
- else:
1402
+ try:
1403
+ with sqlite3.connect(filepath) as conn:
1404
+ cursor = conn.cursor()
1284
1405
  cursor.execute(sql_query)
1285
-
1286
- columns = [description[0] for description in cursor.description]
1287
- data = cursor.fetchall()
1288
-
1289
- return cls([Scenario(dict(zip(columns, row))) for row in data])
1290
-
1291
- except sqlite3.Error as e:
1292
- raise sqlite3.Error(f"Database error occurred: {str(e)}")
1406
+ columns = [description[0] for description in cursor.description]
1407
+ data = cursor.fetchall()
1293
1408
 
1294
- @classmethod
1295
- def from_latex(cls, tex_file_path: str):
1296
- with open(tex_file_path, "r") as file:
1297
- lines = file.readlines()
1409
+ return cls([Scenario(dict(zip(columns, row))) for row in data])
1298
1410
 
1299
- processed_lines = []
1300
- non_blank_lines = [
1301
- (i, line.strip()) for i, line in enumerate(lines) if line.strip()
1302
- ]
1411
+ except sqlite3.Error as e:
1412
+ raise sqlite3.Error(f"Database error occurred: {str(e)}")
1413
+ else:
1414
+ # If a table is specified, use SQLiteSource
1415
+ source = SQLiteSource(filepath, table)
1416
+ return source.to_scenario_list()
1303
1417
 
1304
- for index, (line_no, text) in enumerate(non_blank_lines):
1305
- entry = {
1306
- "line_no": line_no + 1, # Using 1-based index for line numbers
1307
- "text": text,
1308
- "num_words": len(text.split()),
1309
- "num_chars": len(text),
1310
- "line_before": non_blank_lines[index - 1][1] if index > 0 else None,
1311
- "line_after": (
1312
- non_blank_lines[index + 1][1]
1313
- if index < len(non_blank_lines) - 1
1314
- else None
1315
- ),
1316
- }
1317
- processed_lines.append(entry)
1318
-
1319
- return ScenarioList([Scenario(entry) for entry in processed_lines])
1418
+ @classmethod
1419
+ @deprecated_classmethod("ScenarioSource.from_source('latex', ...)")
1420
+ def from_latex(cls, tex_file_path: str, table_index: int = 0, has_header: bool = True):
1421
+ """Create a ScenarioList from a LaTeX file.
1422
+
1423
+ Args:
1424
+ tex_file_path: The path to the LaTeX file.
1425
+ table_index: The index of the table to extract (if multiple tables exist).
1426
+ Default is 0 (first table).
1427
+ has_header: Whether the table has a header row. Default is True.
1428
+
1429
+ Returns:
1430
+ ScenarioList: A new ScenarioList containing the data from the LaTeX table.
1431
+ """
1432
+ from .scenario_source import LaTeXSource
1433
+ source = LaTeXSource(tex_file_path, table_index, has_header)
1434
+ return source.to_scenario_list()
1320
1435
 
1321
1436
  @classmethod
1437
+ @deprecated_classmethod("ScenarioSource.from_source('google_doc', ...)")
1322
1438
  def from_google_doc(cls, url: str) -> ScenarioList:
1323
1439
  """Create a ScenarioList from a Google Doc.
1324
1440
 
@@ -1332,30 +1448,12 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1332
1448
  ScenarioList: An instance of the ScenarioList class.
1333
1449
 
1334
1450
  """
1335
- import tempfile
1336
- import requests
1337
-
1338
- if "/edit" in url:
1339
- doc_id = url.split("/d/")[1].split("/edit")[0]
1340
- else:
1341
- from .exceptions import ValueScenarioError
1342
- raise ValueScenarioError("Invalid Google Doc URL format.")
1343
-
1344
- export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
1345
-
1346
- # Download the Google Doc as a Word file (.docx)
1347
- response = requests.get(export_url)
1348
- response.raise_for_status() # Ensure the request was successful
1349
-
1350
- # Save the Word file to a temporary file
1351
- with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
1352
- temp_file.write(response.content)
1353
- temp_filename = temp_file.name
1354
-
1355
- # Call the from_docx class method with the temporary file
1356
- return cls.from_docx(temp_filename)
1451
+ from .scenario_source import GoogleDocSource
1452
+ source = GoogleDocSource(url)
1453
+ return source.to_scenario_list()
1357
1454
 
1358
1455
  @classmethod
1456
+ @deprecated_classmethod("ScenarioSource.from_source('pandas', ...)")
1359
1457
  def from_pandas(cls, df) -> ScenarioList:
1360
1458
  """Create a ScenarioList from a pandas DataFrame.
1361
1459
 
@@ -1366,105 +1464,48 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1366
1464
  >>> ScenarioList.from_pandas(df)
1367
1465
  ScenarioList([Scenario({'name': 'Alice', 'age': 30, 'location': 'New York'}), Scenario({'name': 'Bob', 'age': 25, 'location': 'Los Angeles'})])
1368
1466
  """
1369
- return cls([Scenario(row) for row in df.to_dict(orient="records")])
1370
-
1467
+ from .scenario_source import PandasSource
1468
+ source = PandasSource(df)
1469
+ return source.to_scenario_list()
1371
1470
 
1372
1471
  @classmethod
1472
+ @deprecated_classmethod("ScenarioSource.from_source('dta', ...)")
1373
1473
  def from_dta(cls, filepath: str, include_metadata: bool = True) -> ScenarioList:
1374
1474
  """Create a ScenarioList from a Stata file.
1375
-
1475
+
1376
1476
  Args:
1377
1477
  filepath (str): Path to the Stata (.dta) file
1378
1478
  include_metadata (bool): If True, extract and preserve variable labels and value labels
1379
1479
  as additional metadata in the ScenarioList
1380
-
1480
+
1381
1481
  Returns:
1382
1482
  ScenarioList: A ScenarioList containing the data from the Stata file
1383
1483
  """
1384
- import pandas as pd
1385
-
1386
- # Read the Stata file with pandas
1387
- df = pd.read_stata(filepath)
1388
-
1389
- # Create the basic ScenarioList
1390
- scenario_list = cls.from_pandas(df)
1391
-
1392
- # Extract and preserve metadata if requested
1393
- if include_metadata:
1394
- # Get variable labels (if any)
1395
- variable_labels = {}
1396
- if hasattr(df, 'variable_labels') and df.variable_labels:
1397
- variable_labels = df.variable_labels
1398
-
1399
- # Get value labels (if any)
1400
- value_labels = {}
1401
- if hasattr(df, 'value_labels') and df.value_labels:
1402
- value_labels = df.value_labels
1403
-
1404
- # Store the metadata in the ScenarioList's codebook
1405
- if variable_labels or value_labels:
1406
- scenario_list.codebook = {
1407
- 'variable_labels': variable_labels,
1408
- 'value_labels': value_labels
1409
- }
1410
-
1411
- return scenario_list
1484
+ from .scenario_source import StataSource
1485
+ source = StataSource(filepath, include_metadata)
1486
+ return source.to_scenario_list()
1412
1487
 
1413
1488
  @classmethod
1414
- def from_wikipedia(cls, url: str, table_index: int = 0):
1489
+ @deprecated_classmethod("ScenarioSource.from_source('wikipedia', ...)")
1490
+ def from_wikipedia(cls, url: str, table_index: int = 0, header: bool = True):
1415
1491
  """
1416
1492
  Extracts a table from a Wikipedia page.
1417
1493
 
1418
1494
  Parameters:
1419
1495
  url (str): The URL of the Wikipedia page.
1420
1496
  table_index (int): The index of the table to extract (default is 0).
1497
+ header (bool): Whether the table has a header row (default is True).
1421
1498
 
1422
1499
  Returns:
1423
- pd.DataFrame: A DataFrame containing the extracted table.
1424
- # # Example usage
1425
- # url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
1426
- # df = from_wikipedia(url, 0)
1427
-
1428
- # if not df.empty:
1429
- # print(df.head())
1430
- # else:
1431
- # print("Failed to extract table.")
1432
-
1433
-
1500
+ ScenarioList: A ScenarioList containing data from the Wikipedia table.
1501
+
1502
+ Example usage:
1503
+ url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
1504
+ scenarios = ScenarioList.from_wikipedia(url, 0)
1434
1505
  """
1435
- import pandas as pd
1436
- import requests
1437
- from requests.exceptions import RequestException
1438
-
1439
- try:
1440
- # Check if the URL is reachable
1441
- response = requests.get(url)
1442
- response.raise_for_status() # Raises HTTPError for bad responses
1443
-
1444
- # Extract tables from the Wikipedia page
1445
- tables = pd.read_html(url)
1446
-
1447
- # Ensure the requested table index is within the range of available tables
1448
- if table_index >= len(tables) or table_index < 0:
1449
- raise IndexError(
1450
- f"Table index {table_index} is out of range. This page has {len(tables)} table(s)."
1451
- )
1452
-
1453
- # Return the requested table as a DataFrame
1454
- # return tables[table_index]
1455
- return cls.from_pandas(tables[table_index])
1456
-
1457
- except RequestException as e:
1458
- print(f"Error fetching the URL: {e}")
1459
- except ValueError as e:
1460
- print(f"Error parsing tables: {e}")
1461
- except IndexError as e:
1462
- print(e)
1463
- except Exception as e:
1464
- print(f"An unexpected error occurred: {e}")
1465
-
1466
- # Return an empty DataFrame in case of an error
1467
- # return cls.from_pandas(pd.DataFrame())
1506
+ from .scenario_source import WikipediaSource
1507
+ source = WikipediaSource(url, table_index, header)
1508
+ return source.to_scenario_list()
1468
1509
 
1469
1510
  def to_key_value(self, field: str, value=None) -> Union[dict, set]:
1470
1511
  """Return the set of values in the field.
@@ -1484,8 +1525,14 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1484
1525
  return {scenario[field]: scenario[value] for scenario in self}
1485
1526
 
1486
1527
  @classmethod
1528
+ @deprecated_classmethod("ScenarioSource.from_source('excel', ...)")
1487
1529
  def from_excel(
1488
- cls, filename: str, sheet_name: Optional[str] = None, skip_rows: Optional[List[int]] = None, use_codebook: bool = False
1530
+ cls,
1531
+ filename: str,
1532
+ sheet_name: Optional[str] = None,
1533
+ skip_rows: Optional[List[int]] = None,
1534
+ use_codebook: bool = False,
1535
+ **kwargs
1489
1536
  ) -> ScenarioList:
1490
1537
  """Create a ScenarioList from an Excel file.
1491
1538
 
@@ -1497,6 +1544,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1497
1544
  sheet_name (Optional[str]): Name of the sheet to load. If None and multiple sheets exist,
1498
1545
  will raise an error listing available sheets.
1499
1546
  skip_rows (Optional[List[int]]): List of row indices to skip (0-based). If None, all rows are included.
1547
+ use_codebook (bool): If True, rename columns to standard format and store original names in codebook.
1548
+ **kwargs: Additional parameters to pass to pandas.read_excel.
1500
1549
 
1501
1550
  Example:
1502
1551
 
@@ -1531,52 +1580,21 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1531
1580
  >>> scenario_list[1]['name']
1532
1581
  'Charlie'
1533
1582
  """
1534
- from .scenario import Scenario
1535
- import pandas as pd
1536
-
1537
- # Get all sheets
1538
- all_sheets = pd.read_excel(filename, sheet_name=None)
1539
-
1540
- # If no sheet_name is provided and there is more than one sheet, print available sheets
1541
- if sheet_name is None:
1542
- if len(all_sheets) > 1:
1543
- print("The Excel file contains multiple sheets:")
1544
- for name in all_sheets.keys():
1545
- print(f"- {name}")
1546
- from .exceptions import ValueScenarioError
1547
- raise ValueScenarioError("Please provide a sheet name to load data from.")
1548
- else:
1549
- # If there is only one sheet, use it
1550
- sheet_name = list(all_sheets.keys())[0]
1551
-
1552
- # Load the specified or determined sheet
1553
- df = pd.read_excel(filename, sheet_name=sheet_name)
1554
-
1555
- # Skip specified rows if any
1556
- if skip_rows:
1557
- df = df.drop(skip_rows)
1558
- # Reset index to ensure continuous indexing
1559
- df = df.reset_index(drop=True)
1560
-
1561
- if use_codebook:
1562
- codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
1563
- koobedoc = {col:f"col_{i}" for i, col in enumerate(df.columns)}
1564
-
1565
- observations = []
1566
- for _, row in df.iterrows():
1567
- if use_codebook:
1568
- observations.append(Scenario({koobedoc.get(k):v for k,v in row.to_dict().items()}))
1569
- else:
1570
- observations.append(Scenario(row.to_dict()))
1571
-
1572
-
1573
- if use_codebook:
1574
- return cls(observations, codebook=codebook)
1575
- else:
1576
- return cls(observations)
1583
+ from .scenario_source import ExcelSource
1584
+ source = ExcelSource(
1585
+ file_path=filename,
1586
+ sheet_name=sheet_name,
1587
+ skip_rows=skip_rows,
1588
+ use_codebook=use_codebook,
1589
+ **kwargs
1590
+ )
1591
+ return source.to_scenario_list()
1577
1592
 
1578
1593
  @classmethod
1579
- def from_google_sheet(cls, url: str, sheet_name: str = None, column_names: Optional[List[str]]= None) -> ScenarioList:
1594
+ @deprecated_classmethod("ScenarioSource.from_source('google_sheet', ...)")
1595
+ def from_google_sheet(
1596
+ cls, url: str, sheet_name: str = None, column_names: Optional[List[str]] = None, **kwargs
1597
+ ) -> ScenarioList:
1580
1598
  """Create a ScenarioList from a Google Sheet.
1581
1599
 
1582
1600
  This method downloads the Google Sheet as an Excel file, saves it to a temporary file,
@@ -1588,126 +1606,111 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1588
1606
  the same as from_excel regarding multiple sheets.
1589
1607
  column_names (List[str], optional): If provided, use these names for the columns instead
1590
1608
  of the default column names from the sheet.
1609
+ **kwargs: Additional parameters to pass to pandas.read_excel.
1591
1610
 
1592
1611
  Returns:
1593
1612
  ScenarioList: An instance of the ScenarioList class.
1594
1613
 
1595
1614
  """
1596
- import tempfile
1597
- import requests
1598
-
1599
- if "/edit" in url:
1600
- sheet_id = url.split("/d/")[1].split("/edit")[0]
1601
- else:
1602
- from .exceptions import ValueScenarioError
1603
- raise ValueScenarioError("Invalid Google Sheet URL format.")
1604
-
1605
- export_url = (
1606
- f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
1607
- )
1608
-
1609
- # Download the Google Sheet as an Excel file
1610
- response = requests.get(export_url)
1611
- response.raise_for_status() # Ensure the request was successful
1612
-
1613
- # Save the Excel file to a temporary file
1614
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
1615
- temp_file.write(response.content)
1616
- temp_filename = temp_file.name
1617
-
1618
- # First create the ScenarioList with default column names
1619
- scenario_list = cls.from_excel(temp_filename, sheet_name=sheet_name)
1620
-
1621
- # If column_names is provided, create a new ScenarioList with the specified names
1622
- if column_names is not None:
1623
- if len(column_names) != len(scenario_list[0].keys()):
1624
- raise ValueError(
1625
- f"Number of provided column names ({len(column_names)}) "
1626
- f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
1627
- )
1628
-
1629
- # Create a codebook mapping original keys to new names
1630
- original_keys = list(scenario_list[0].keys())
1631
- codebook = dict(zip(original_keys, column_names))
1632
-
1633
- # Return new ScenarioList with renamed columns
1634
- return scenario_list.rename(codebook)
1635
- else:
1636
- return scenario_list
1615
+ from .scenario_source import GoogleSheetSource
1616
+ source = GoogleSheetSource(url, sheet_name=sheet_name, column_names=column_names, **kwargs)
1617
+ return source.to_scenario_list()
1637
1618
 
1638
1619
  @classmethod
1620
+ @deprecated_classmethod("ScenarioSource.from_source('delimited_file', ...)")
1639
1621
  def from_delimited_file(
1640
- cls, source: Union[str, "ParseResult"], delimiter: str = ","
1622
+ cls, source: Union[str, "ParseResult"], delimiter: str = ",", encoding: str = "utf-8", **kwargs
1641
1623
  ) -> ScenarioList:
1642
- """Create a ScenarioList from a delimited file (CSV/TSV) or URL."""
1643
- import requests
1644
- from .scenario import Scenario
1645
- from urllib.parse import urlparse
1624
+ """Create a ScenarioList from a delimited file (CSV/TSV) or URL.
1625
+
1626
+ Args:
1627
+ source: Path to a local file or URL to a remote file.
1628
+ delimiter: The delimiter character used in the file (default is ',').
1629
+ encoding: The file encoding to use (default is 'utf-8').
1630
+ **kwargs: Additional parameters for csv reader.
1631
+
1632
+ Returns:
1633
+ ScenarioList: An instance of the ScenarioList class.
1634
+ """
1635
+ from .scenario_source import DelimitedFileSource
1646
1636
  from urllib.parse import ParseResult
1647
-
1648
- headers = {
1649
- "Accept": "text/csv,application/csv,text/plain",
1650
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
1651
- }
1652
-
1653
- def is_url(source):
1654
- try:
1655
- result = urlparse(source)
1656
- return all([result.scheme, result.netloc])
1657
- except ValueError:
1658
- return False
1659
-
1660
- try:
1661
- if isinstance(source, str) and is_url(source):
1662
- response = requests.get(source, headers=headers)
1663
- response.raise_for_status()
1664
- file_obj = StringIO(response.text)
1665
- elif isinstance(source, ParseResult):
1666
- response = requests.get(source.geturl(), headers=headers)
1667
- response.raise_for_status()
1668
- file_obj = StringIO(response.text)
1669
- else:
1670
- # Try different encodings if the default fails
1671
- encodings_to_try = ["utf-8", "latin-1", "cp1252", "ISO-8859-1"]
1672
- last_exception = None
1673
- file_obj = None
1674
-
1675
- for encoding in encodings_to_try:
1676
- try:
1677
- file_obj = open(source, "r", encoding=encoding)
1678
- # Test reading a bit to verify encoding
1679
- file_obj.readline()
1680
- file_obj.seek(0) # Reset file position
1681
- break
1682
- except UnicodeDecodeError as e:
1683
- last_exception = e
1684
- if file_obj:
1685
- file_obj.close()
1686
- file_obj = None
1687
-
1688
- if file_obj is None:
1689
- from .exceptions import ValueScenarioError
1690
- raise ValueScenarioError(f"Could not decode file {source} with any of the attempted encodings. Original error: {last_exception}")
1691
-
1692
- reader = csv.reader(file_obj, delimiter=delimiter)
1693
- try:
1694
- header = next(reader)
1695
- observations = [Scenario(dict(zip(header, row))) for row in reader]
1696
- except StopIteration:
1697
- from .exceptions import ValueScenarioError
1698
- raise ValueScenarioError(f"File {source} appears to be empty or has an invalid format")
1699
-
1700
- finally:
1701
- if file_obj:
1702
- file_obj.close()
1703
-
1704
- return cls(observations)
1637
+
1638
+ if isinstance(source, ParseResult):
1639
+ # Convert ParseResult to string URL
1640
+ file_or_url = source.geturl()
1641
+ else:
1642
+ file_or_url = source
1643
+
1644
+ source = DelimitedFileSource(
1645
+ file_or_url=file_or_url,
1646
+ delimiter=delimiter,
1647
+ encoding=encoding,
1648
+ **kwargs
1649
+ )
1650
+ return source.to_scenario_list()
1705
1651
 
1706
1652
  # Convenience methods for specific file types
1707
1653
  @classmethod
1708
- def from_csv(cls, source: Union[str, "ParseResult"]) -> ScenarioList:
1709
- """Create a ScenarioList from a CSV file or URL."""
1710
- return cls.from_delimited_file(source, delimiter=",")
1654
+ @deprecated_classmethod("ScenarioSource.from_source('csv', ...)")
1655
+ def from_csv(cls, source: Union[str, "ParseResult"], has_header: bool = True, encoding: str = "utf-8", **kwargs) -> ScenarioList:
1656
+ """Create a ScenarioList from a CSV file or URL.
1657
+
1658
+ Args:
1659
+ source: Path to a local file or URL to a remote file.
1660
+ has_header: Whether the file has a header row (default is True).
1661
+ encoding: The file encoding to use (default is 'utf-8').
1662
+ **kwargs: Additional parameters for csv reader.
1663
+
1664
+ Returns:
1665
+ ScenarioList: An instance of the ScenarioList class.
1666
+ """
1667
+ from .scenario_source import CSVSource
1668
+ from urllib.parse import ParseResult
1669
+
1670
+ if isinstance(source, ParseResult):
1671
+ # Convert ParseResult to string URL
1672
+ file_or_url = source.geturl()
1673
+ else:
1674
+ file_or_url = source
1675
+
1676
+ source = CSVSource(
1677
+ file_or_url=file_or_url,
1678
+ has_header=has_header,
1679
+ encoding=encoding,
1680
+ **kwargs
1681
+ )
1682
+ return source.to_scenario_list()
1683
+
1684
+ @classmethod
1685
+ @deprecated_classmethod("ScenarioSource.from_source('tsv', ...)")
1686
+ def from_tsv(cls, source: Union[str, "ParseResult"], has_header: bool = True, encoding: str = "utf-8", **kwargs) -> ScenarioList:
1687
+ """Create a ScenarioList from a TSV file or URL.
1688
+
1689
+ Args:
1690
+ source: Path to a local file or URL to a remote file.
1691
+ has_header: Whether the file has a header row (default is True).
1692
+ encoding: The file encoding to use (default is 'utf-8').
1693
+ **kwargs: Additional parameters for csv reader.
1694
+
1695
+ Returns:
1696
+ ScenarioList: An instance of the ScenarioList class.
1697
+ """
1698
+ from .scenario_source import TSVSource
1699
+ from urllib.parse import ParseResult
1700
+
1701
+ if isinstance(source, ParseResult):
1702
+ # Convert ParseResult to string URL
1703
+ file_or_url = source.geturl()
1704
+ else:
1705
+ file_or_url = source
1706
+
1707
+ source = TSVSource(
1708
+ file_or_url=file_or_url,
1709
+ has_header=has_header,
1710
+ encoding=encoding,
1711
+ **kwargs
1712
+ )
1713
+ return source.to_scenario_list()
1711
1714
 
1712
1715
  def left_join(self, other: ScenarioList, by: Union[str, list[str]]) -> ScenarioList:
1713
1716
  """Perform a left join with another ScenarioList, following SQL join semantics.
@@ -1730,21 +1733,35 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1730
1733
  @classmethod
1731
1734
  def from_tsv(cls, source: Union[str, "ParseResult"]) -> ScenarioList:
1732
1735
  """Create a ScenarioList from a TSV file or URL."""
1733
- return cls.from_delimited_file(source, delimiter="\t")
1736
+ from .scenario_source import ScenarioSource
1737
+
1738
+ # Delegate to ScenarioSource implementation
1739
+ return ScenarioSource._from_tsv(source)
1734
1740
 
1735
1741
  def to_dict(self, sort: bool = False, add_edsl_version: bool = True) -> dict:
1736
1742
  """
1737
1743
  >>> s = ScenarioList([Scenario({'food': 'wood chips'}), Scenario({'food': 'wood-fired pizza'})])
1738
- >>> s.to_dict()
1744
+ >>> s.to_dict() # doctest: +ELLIPSIS
1739
1745
  {'scenarios': [{'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}, {'food': 'wood-fired pizza', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}], 'edsl_version': '...', 'edsl_class_name': 'ScenarioList'}
1740
1746
 
1747
+ >>> s = ScenarioList([Scenario({'food': 'wood chips'})], codebook={'food': 'description'})
1748
+ >>> d = s.to_dict()
1749
+ >>> 'codebook' in d
1750
+ True
1751
+ >>> d['codebook'] == {'food': 'description'}
1752
+ True
1741
1753
  """
1742
1754
  if sort:
1743
1755
  data = sorted(self, key=lambda x: hash(x))
1744
1756
  else:
1745
1757
  data = self
1758
+
1746
1759
  d = {"scenarios": [s.to_dict(add_edsl_version=add_edsl_version) for s in data]}
1747
1760
 
1761
+ # Add codebook if it exists
1762
+ if hasattr(self, 'codebook') and self.codebook:
1763
+ d['codebook'] = self.codebook
1764
+
1748
1765
  if add_edsl_version:
1749
1766
  from .. import __version__
1750
1767
 
@@ -1758,8 +1775,8 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1758
1775
  :param survey: The Survey object to use for the Jobs object.
1759
1776
 
1760
1777
  Example:
1761
- >>> from edsl import Survey, Jobs, ScenarioList
1762
- >>> isinstance(ScenarioList.example().to(Survey.example()), Jobs)
1778
+ >>> from edsl import Survey, Jobs, ScenarioList # doctest: +SKIP
1779
+ >>> isinstance(ScenarioList.example().to(Survey.example()), Jobs) # doctest: +SKIP
1763
1780
  True
1764
1781
  """
1765
1782
  from ..surveys import Survey
@@ -1786,11 +1803,23 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1786
1803
 
1787
1804
  @classmethod
1788
1805
  @remove_edsl_version
1789
- def from_dict(cls, data) -> ScenarioList:
1790
- """Create a `ScenarioList` from a dictionary."""
1806
+ def from_dict(cls, data: dict) -> ScenarioList:
1807
+ """Create a `ScenarioList` from a dictionary.
1808
+
1809
+ >>> d = {'scenarios': [{'food': 'wood chips'}], 'codebook': {'food': 'description'}}
1810
+ >>> s = ScenarioList.from_dict(d)
1811
+ >>> s.codebook == {'food': 'description'}
1812
+ True
1813
+ >>> s[0]['food']
1814
+ 'wood chips'
1815
+ """
1791
1816
  from .scenario import Scenario
1792
1817
 
1793
- return cls([Scenario.from_dict(s) for s in data["scenarios"]])
1818
+ # Extract codebook if it exists
1819
+ codebook = data.get('codebook', None)
1820
+
1821
+ # Create ScenarioList with scenarios and codebook
1822
+ return cls([Scenario.from_dict(s) for s in data["scenarios"]], codebook=codebook)
1794
1823
 
1795
1824
  @classmethod
1796
1825
  def from_nested_dict(cls, data: dict) -> ScenarioList:
@@ -1835,62 +1864,80 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1835
1864
  """
1836
1865
  return cls([Scenario.example(randomize), Scenario.example(randomize)])
1837
1866
 
1838
- def __getitem__(self, key: Union[int, slice]) -> Any:
1839
- """Return the item at the given index.
1840
1867
 
1841
- Example:
1842
- >>> s = ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5}), Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
1843
- >>> s[0]
1844
- Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})
1868
+ def items(self):
1869
+ """Make this class compatible with dict.items() by accessing first scenario items.
1845
1870
 
1846
- >>> s[:1]
1847
- ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
1871
+ This ensures the class works as a drop-in replacement for UserList in code
1872
+ that expects a dictionary-like interface.
1848
1873
 
1874
+ Returns:
1875
+ items view from the first scenario object if available, empty list otherwise
1849
1876
  """
1850
- if isinstance(key, slice):
1851
- return ScenarioList(super().__getitem__(key))
1852
- elif isinstance(key, int):
1853
- return super().__getitem__(key)
1854
- else:
1855
- return self.to_dict(add_edsl_version=False)[key]
1877
+ if len(self.data) > 0:
1878
+ return self.data[0].items()
1879
+ return {}.items()
1856
1880
 
1857
- def to_agent_list(self):
1858
- """Convert the ScenarioList to an AgentList.
1881
+ def copy(self):
1882
+ """Create a copy of this ScenarioList.
1859
1883
 
1860
- Example:
1861
-
1862
- >>> s = ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5}), Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
1863
- >>> s.to_agent_list()
1864
- AgentList([Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5}), Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5})])
1884
+ Returns:
1885
+ A new ScenarioList with copies of the same scenarios
1865
1886
  """
1866
- from ..agents import AgentList, Agent
1887
+ # Get copies of all scenarios
1888
+ if len(self.data) > 0:
1889
+ # If we have at least one scenario, copy the first one
1890
+ if hasattr(self.data[0], "copy"):
1891
+ return self.data[0].copy()
1892
+ # Otherwise try to convert to Scenario
1893
+ from .scenario import Scenario
1867
1894
 
1868
- agents = []
1869
- for scenario in self:
1870
- new_scenario = scenario.copy().data
1871
- if "name" in new_scenario:
1872
- name = new_scenario.pop("name")
1873
- proposed_agent_name = "agent_name"
1874
- while proposed_agent_name not in new_scenario:
1875
- proposed_agent_name += "_"
1876
- warnings.warn(
1877
- f"The 'name' field is reserved for the agent's name---putting this value in {proposed_agent_name}"
1878
- )
1879
- new_scenario[proposed_agent_name] = name
1880
- new_agent = Agent(traits=new_scenario, name=name)
1881
- if "agent_parameters" in new_scenario:
1882
- agent_parameters = new_scenario.pop("agent_parameters")
1883
- instruction = agent_parameters.get("instruction", None)
1884
- name = agent_parameters.get("name", None)
1885
- new_agent = Agent(
1886
- traits=new_scenario, name=name, instruction=instruction
1887
- )
1888
- else:
1889
- new_agent = Agent(traits=new_scenario)
1895
+ try:
1896
+ return Scenario(dict(self.data[0]))
1897
+ except (TypeError, ValueError):
1898
+ # Fallback to empty scenario
1899
+ return Scenario({})
1890
1900
 
1891
- agents.append(new_agent)
1892
1901
 
1893
- return AgentList(agents)
1902
+ def to_agent_list(self):
1903
+ """Convert the ScenarioList to an AgentList.
1904
+
1905
+ This method supports special fields that map to Agent parameters:
1906
+ - "name": Will be used as the agent's name
1907
+ - "agent_parameters": A dictionary containing:
1908
+ - "instruction": The agent's instruction text
1909
+ - "name": The agent's name (overrides the "name" field if present)
1910
+
1911
+ Example:
1912
+ >>> from edsl import ScenarioList, Scenario
1913
+ >>> # Basic usage with traits
1914
+ >>> s = ScenarioList([Scenario({'age': 22, 'hair': 'brown', 'height': 5.5})])
1915
+ >>> al = s.to_agent_list()
1916
+ >>> al
1917
+ AgentList([Agent(traits = {'age': 22, 'hair': 'brown', 'height': 5.5})])
1918
+
1919
+ >>> # Using agent name
1920
+ >>> s = ScenarioList([Scenario({'name': 'Alice', 'age': 22})])
1921
+ >>> al = s.to_agent_list()
1922
+ >>> al[0].name
1923
+ 'Alice'
1924
+
1925
+ >>> # Using agent parameters for instructions
1926
+ >>> s = ScenarioList([Scenario({
1927
+ ... 'age': 22,
1928
+ ... 'agent_parameters': {
1929
+ ... 'instruction': 'You are a helpful assistant',
1930
+ ... 'name': 'Assistant'
1931
+ ... }
1932
+ ... })])
1933
+ >>> al = s.to_agent_list()
1934
+ >>> al[0].instruction
1935
+ 'You are a helpful assistant'
1936
+ >>> al[0].name
1937
+ 'Assistant'
1938
+ """
1939
+ from ..agents import AgentList
1940
+ return AgentList.from_scenario_list(self)
1894
1941
 
1895
1942
  def chunk(
1896
1943
  self,
@@ -1920,7 +1967,9 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1920
1967
  new_scenarios.extend(replacement_scenarios)
1921
1968
  return ScenarioList(new_scenarios)
1922
1969
 
1923
- def collapse(self, field: str, separator: Optional[str] = None, add_count: bool = False) -> ScenarioList:
1970
+ def collapse(
1971
+ self, field: str, separator: Optional[str] = None, add_count: bool = False
1972
+ ) -> ScenarioList:
1924
1973
  """Collapse a ScenarioList by grouping on all fields except the specified one,
1925
1974
  collecting the values of the specified field into a list.
1926
1975
 
@@ -1943,10 +1992,10 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1943
1992
  """
1944
1993
  if not self:
1945
1994
  return ScenarioList([])
1946
-
1995
+
1947
1996
  # Determine all fields except the one to collapse
1948
1997
  id_vars = [key for key in self[0].keys() if key != field]
1949
-
1998
+
1950
1999
  # Group the scenarios
1951
2000
  grouped = defaultdict(list)
1952
2001
  for scenario in self:
@@ -1954,33 +2003,34 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1954
2003
  key = tuple(scenario[id_var] for id_var in id_vars)
1955
2004
  # Add the value of the field to collapse to the list for this key
1956
2005
  grouped[key].append(scenario[field])
1957
-
2006
+
1958
2007
  # Create a new ScenarioList with the collapsed field
1959
- result = []
2008
+ new_sl = ScenarioList(data = [], codebook=self.codebook)
1960
2009
  for key, values in grouped.items():
1961
2010
  new_scenario = dict(zip(id_vars, key))
1962
2011
  if separator:
1963
- new_scenario[field] = separator.join(values)
2012
+ new_scenario[field] = separator.join([str(x) for x in values])
1964
2013
  else:
1965
2014
  new_scenario[field] = values
1966
2015
  if add_count:
1967
- new_scenario['num_collapsed_rows'] = len(values)
1968
- result.append(Scenario(new_scenario))
1969
-
1970
- return ScenarioList(result)
2016
+ new_scenario["num_collapsed_rows"] = len(values)
2017
+ new_sl.append(Scenario(new_scenario))
2018
+
2019
+ #return ScenarioList(result)
2020
+ return new_sl
1971
2021
 
1972
2022
  def create_comparisons(
1973
- self,
1974
- bidirectional: bool = False,
2023
+ self,
2024
+ bidirectional: bool = False,
1975
2025
  num_options: int = 2,
1976
2026
  option_prefix: str = "option_",
1977
- use_alphabet: bool = False
2027
+ use_alphabet: bool = False,
1978
2028
  ) -> ScenarioList:
1979
2029
  """Create a new ScenarioList with comparisons between scenarios.
1980
-
2030
+
1981
2031
  Each scenario in the result contains multiple original scenarios as dictionaries,
1982
2032
  allowing for side-by-side comparison.
1983
-
2033
+
1984
2034
  Args:
1985
2035
  bidirectional (bool): If True, include both (A,B) and (B,A) comparisons.
1986
2036
  If False, only include (A,B) where A comes before B in the original list.
@@ -1991,11 +2041,11 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
1991
2041
  Ignored if use_alphabet is True.
1992
2042
  use_alphabet (bool): If True, use letters as keys (A, B, C, etc.) instead of
1993
2043
  the option_prefix with numbers.
1994
-
2044
+
1995
2045
  Returns:
1996
2046
  ScenarioList: A new ScenarioList where each scenario contains multiple original
1997
2047
  scenarios as dictionaries.
1998
-
2048
+
1999
2049
  Example:
2000
2050
  >>> s = ScenarioList([
2001
2051
  ... Scenario({'id': 1, 'text': 'Option A'}),
@@ -2009,22 +2059,29 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
2009
2059
  """
2010
2060
  from itertools import combinations, permutations
2011
2061
  import string
2012
-
2062
+
2013
2063
  if num_options < 2:
2014
2064
  from .exceptions import ValueScenarioError
2065
+
2015
2066
  raise ValueScenarioError("num_options must be at least 2")
2016
-
2067
+
2017
2068
  if num_options > len(self):
2018
2069
  from .exceptions import ValueScenarioError
2019
- raise ValueScenarioError(f"num_options ({num_options}) cannot exceed the number of scenarios ({len(self)})")
2020
-
2070
+
2071
+ raise ValueScenarioError(
2072
+ f"num_options ({num_options}) cannot exceed the number of scenarios ({len(self)})"
2073
+ )
2074
+
2021
2075
  if use_alphabet and num_options > 26:
2022
2076
  from .exceptions import ValueScenarioError
2023
- raise ValueScenarioError("When using alphabet labels, num_options cannot exceed 26 (the number of letters in the English alphabet)")
2024
-
2077
+
2078
+ raise ValueScenarioError(
2079
+ "When using alphabet labels, num_options cannot exceed 26 (the number of letters in the English alphabet)"
2080
+ )
2081
+
2025
2082
  # Convert each scenario to a dictionary
2026
2083
  scenario_dicts = [scenario.to_dict(add_edsl_version=False) for scenario in self]
2027
-
2084
+
2028
2085
  # Generate combinations or permutations based on bidirectional flag
2029
2086
  if bidirectional:
2030
2087
  # For bidirectional, use permutations to get all ordered arrangements
@@ -2032,13 +2089,13 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
2032
2089
  # For pairwise, we can use permutations with r=2
2033
2090
  scenario_groups = permutations(scenario_dicts, 2)
2034
2091
  else:
2035
- # For more than 2 options with bidirectional=True,
2092
+ # For more than 2 options with bidirectional=True,
2036
2093
  # we need all permutations of the specified size
2037
2094
  scenario_groups = permutations(scenario_dicts, num_options)
2038
2095
  else:
2039
2096
  # For unidirectional, use combinations to get unordered groups
2040
2097
  scenario_groups = combinations(scenario_dicts, num_options)
2041
-
2098
+
2042
2099
  # Create new scenarios with the combinations
2043
2100
  result = []
2044
2101
  for group in scenario_groups:
@@ -2052,64 +2109,35 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
2052
2109
  key = f"{option_prefix}{i+1}"
2053
2110
  new_scenario[key] = scenario_dict
2054
2111
  result.append(Scenario(new_scenario))
2055
-
2112
+
2056
2113
  return ScenarioList(result)
2114
+
2057
2115
 
2058
2116
  @classmethod
2117
+ @deprecated_classmethod("ScenarioSource.from_source('parquet', ...)")
2059
2118
  def from_parquet(cls, filepath: str) -> ScenarioList:
2060
2119
  """Create a ScenarioList from a Parquet file.
2061
-
2120
+
2062
2121
  Args:
2063
- filepath (str): Path to the Parquet file
2064
-
2122
+ filepath (str): The path to the Parquet file.
2123
+
2065
2124
  Returns:
2066
- ScenarioList: A ScenarioList containing the data from the Parquet file
2067
-
2068
- Example:
2069
- >>> import pandas as pd
2070
- >>> import tempfile
2071
- >>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [30, 25]})
2072
- >>> # The following would create and read a parquet file if dependencies are installed:
2073
- >>> # with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f:
2074
- >>> # df.to_parquet(f.name)
2075
- >>> # scenario_list = ScenarioList.from_parquet(f.name)
2076
- >>> # Instead, we'll demonstrate the equivalent result:
2077
- >>> scenario_list = ScenarioList.from_pandas(df)
2078
- >>> len(scenario_list)
2079
- 2
2080
- >>> scenario_list[0]['name']
2081
- 'Alice'
2125
+ ScenarioList: A new ScenarioList containing the scenarios from the Parquet file.
2082
2126
  """
2083
- import pandas as pd
2084
-
2085
- try:
2086
- # Try to read the Parquet file with pandas
2087
- df = pd.read_parquet(filepath)
2088
- except ImportError as e:
2089
- # Handle missing dependencies with a helpful error message
2090
- if "pyarrow" in str(e) or "fastparquet" in str(e):
2091
- raise ImportError(
2092
- "Missing dependencies for Parquet support. Please install either pyarrow or fastparquet:\n"
2093
- " pip install pyarrow\n"
2094
- " or\n"
2095
- " pip install fastparquet"
2096
- ) from e
2097
- else:
2098
- raise
2099
-
2100
- # Convert the DataFrame to a ScenarioList
2101
- return cls.from_pandas(df)
2127
+ from .scenario_source import ParquetSource
2128
+ source = ParquetSource(filepath)
2129
+ return source.to_scenario_list()
2102
2130
 
2103
- def replace_values(self, replacements:dict) -> "ScenarioList":
2131
+ def replace_values(self, replacements: dict) -> "ScenarioList":
2104
2132
  """
2105
2133
  Create new scenarios with values replaced according to the provided replacement dictionary.
2106
-
2134
+
2107
2135
  Args:
2108
2136
  replacements (dict): Dictionary of values to replace {old_value: new_value}
2109
-
2137
+
2110
2138
  Returns:
2111
2139
  ScenarioList: A new ScenarioList with replaced values
2112
-
2140
+
2113
2141
  Examples:
2114
2142
  >>> scenarios = ScenarioList([
2115
2143
  ... Scenario({'a': 'nan', 'b': 1}),
@@ -2122,7 +2150,7 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
2122
2150
  >>> print(scenarios)
2123
2151
  ScenarioList([Scenario({'a': 'nan', 'b': 1}), Scenario({'a': 2, 'b': 'nan'})])
2124
2152
  """
2125
- new_scenarios = []
2153
+ new_sl = ScenarioList(data=[], codebook=self.codebook)
2126
2154
  for scenario in self:
2127
2155
  new_scenario = {}
2128
2156
  for key, value in scenario.items():
@@ -2130,20 +2158,65 @@ class ScenarioList(Base, UserList, ScenarioListOperationsMixin):
2130
2158
  new_scenario[key] = replacements[str(value)]
2131
2159
  else:
2132
2160
  new_scenario[key] = value
2133
- new_scenarios.append(Scenario(new_scenario))
2134
- return ScenarioList(new_scenarios)
2135
-
2161
+ new_sl.append(Scenario(new_scenario))
2162
+ return new_sl
2163
+
2136
2164
  @classmethod
2165
+ @deprecated_classmethod("ScenarioSource.from_source('pdf', ...)")
2137
2166
  def from_pdf(cls, filename_or_url, collapse_pages=False):
2138
- return PdfTools.from_pdf(filename_or_url, collapse_pages)
2139
-
2167
+ """Create a ScenarioList from a PDF file or URL."""
2168
+ from .scenario_source import PDFSource
2169
+
2170
+ source = PDFSource(
2171
+ file_path=filename_or_url,
2172
+ chunk_type="page" if not collapse_pages else "text",
2173
+ chunk_size=1
2174
+ )
2175
+ return source.to_scenario_list()
2176
+
2140
2177
  @classmethod
2178
+ @deprecated_classmethod("ScenarioSource.from_source('pdf_to_image', ...)")
2141
2179
  def from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
2142
- return PdfTools.from_pdf_to_image(pdf_path, image_format)
2143
-
2180
+ """Create a ScenarioList with images extracted from a PDF file."""
2181
+ from .scenario_source import PDFImageSource
2182
+
2183
+ source = PDFImageSource(
2184
+ file_path=pdf_path,
2185
+ base_width=2000,
2186
+ include_text=True
2187
+ )
2188
+ return source.to_scenario_list()
2189
+
2190
+ @classmethod
2191
+ def from_source(cls, source_type: str, *args, **kwargs) -> "ScenarioList":
2192
+ """
2193
+ Create a ScenarioList from a specified source type.
2194
+
2195
+ This method serves as the main entry point for creating ScenarioList objects,
2196
+ providing a unified interface for various data sources.
2197
+
2198
+ Args:
2199
+ source_type: The type of source to create a ScenarioList from.
2200
+ Valid values include: 'urls', 'directory', 'csv', 'tsv',
2201
+ 'excel', 'pdf', 'pdf_to_image', and others.
2202
+ *args: Positional arguments to pass to the source-specific method.
2203
+ **kwargs: Keyword arguments to pass to the source-specific method.
2204
+
2205
+ Returns:
2206
+ A ScenarioList object created from the specified source.
2207
+
2208
+ Examples:
2209
+ >>> # This is a simplified example for doctest
2210
+ >>> # In real usage, you would provide a path to your CSV file:
2211
+ >>> # sl_csv = ScenarioList.from_source('csv', 'your_data.csv')
2212
+ >>> # Or use other source types like 'directory', 'excel', etc.
2213
+ >>> # Examples of other source types:
2214
+ >>> # sl_dir = ScenarioList.from_source('directory', '/path/to/files')
2215
+ """
2216
+ from .scenario_source import ScenarioSource
2217
+ return ScenarioSource.from_source(source_type, *args, **kwargs)
2144
2218
 
2145
2219
 
2146
2220
  if __name__ == "__main__":
2147
2221
  import doctest
2148
-
2149
- doctest.testmod(optionflags=doctest.ELLIPSIS)
2222
+ doctest.testmod(optionflags=doctest.ELLIPSIS)