edsl 0.1.60__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,32 +16,53 @@ from __future__ import annotations
16
16
  import functools
17
17
  import warnings
18
18
  import fnmatch
19
- from typing import Any, Callable, List, Literal, Optional, Type, TypeVar, Union, TYPE_CHECKING, cast, Any
19
+ from collections import defaultdict
20
+ import warnings
21
+ from typing import (
22
+ Any,
23
+ Callable,
24
+ List,
25
+ Literal,
26
+ Optional,
27
+ Type,
28
+ TypeVar,
29
+ Union,
30
+ TYPE_CHECKING,
31
+ cast,
32
+ Any,
33
+ )
34
+
35
+ T = TypeVar("T")
20
36
 
21
- T = TypeVar('T')
22
37
 
23
- def deprecated_classmethod(alternative: str) -> Callable[[Callable[..., T]], Callable[..., T]]:
38
+ def deprecated_classmethod(
39
+ alternative: str,
40
+ ) -> Callable[[Callable[..., T]], Callable[..., T]]:
24
41
  """
25
42
  Decorator that marks a class method as deprecated.
26
-
43
+
27
44
  Args:
28
45
  alternative: The suggested alternative to use instead
29
-
46
+
30
47
  Returns:
31
48
  A decorator function that wraps the original method with a deprecation warning
32
49
  """
50
+
33
51
  def decorator(func: Callable[..., T]) -> Callable[..., T]:
34
52
  @functools.wraps(func)
35
53
  def wrapper(*args: Any, **kwargs: Any) -> T:
36
54
  warnings.warn(
37
55
  f"{func.__qualname__} is deprecated. Use {alternative} instead.",
38
56
  DeprecationWarning,
39
- stacklevel=2
57
+ stacklevel=2,
40
58
  )
41
59
  return func(*args, **kwargs)
60
+
42
61
  return wrapper
62
+
43
63
  return decorator
44
64
 
65
+
45
66
  import os
46
67
  import csv
47
68
  import json
@@ -61,27 +82,28 @@ from .exceptions import ScenarioError
61
82
 
62
83
  from abc import ABC, abstractmethod
63
84
 
85
+
64
86
  class Source(ABC):
65
87
  # Registry to store child classes and their source types
66
- _registry: dict[str, Type['Source']] = {}
88
+ _registry: dict[str, Type["Source"]] = {}
67
89
 
68
90
  def __init_subclass__(cls, **kwargs):
69
91
  """Automatically register subclasses with their source_type."""
70
92
  super().__init_subclass__(**kwargs)
71
- if hasattr(cls, 'source_type'):
93
+ if hasattr(cls, "source_type"):
72
94
  Source._registry[cls.source_type] = cls
73
95
 
74
96
  @classmethod
75
97
  @abstractmethod
76
- def example(cls) -> 'Source':
98
+ def example(cls) -> "Source":
77
99
  """
78
100
  Return an example instance of this Source type.
79
-
101
+
80
102
  This method should return a valid instance of the Source subclass
81
103
  that can be used for testing. The instance should be created with
82
104
  reasonable default values that will produce a valid ScenarioList
83
105
  when to_scenario_list() is called.
84
-
106
+
85
107
  Returns:
86
108
  An instance of the Source subclass
87
109
  """
@@ -91,14 +113,14 @@ class Source(ABC):
91
113
  def to_scenario_list(self):
92
114
  """
93
115
  Convert the source to a ScenarioList.
94
-
116
+
95
117
  Returns:
96
118
  A ScenarioList containing the data from this source
97
119
  """
98
120
  pass
99
121
 
100
122
  @classmethod
101
- def get_source_class(cls, source_type: str) -> Type['Source']:
123
+ def get_source_class(cls, source_type: str) -> Type["Source"]:
102
124
  """Get the Source subclass for a given source_type."""
103
125
  if source_type not in cls._registry:
104
126
  raise ValueError(f"No Source subclass found for source_type: {source_type}")
@@ -114,12 +136,12 @@ class Source(ABC):
114
136
  """
115
137
  Test all registered source types by creating an example instance
116
138
  and calling to_scenario_list() on it.
117
-
139
+
118
140
  Returns:
119
141
  A dictionary mapping source types to boolean success values
120
142
  """
121
143
  from .scenario_list import ScenarioList
122
-
144
+
123
145
  results = {}
124
146
  for source_type, source_class in cls._registry.items():
125
147
  try:
@@ -130,7 +152,9 @@ class Source(ABC):
130
152
  # Basic validation
131
153
  if not isinstance(scenario_list, ScenarioList):
132
154
  results[source_type] = False
133
- print(f"Source {source_type} returned {type(scenario_list)} instead of ScenarioList")
155
+ print(
156
+ f"Source {source_type} returned {type(scenario_list)} instead of ScenarioList"
157
+ )
134
158
  else:
135
159
  results[source_type] = True
136
160
  except Exception as e:
@@ -138,6 +162,7 @@ class Source(ABC):
138
162
  print(f"Source {source_type} exception: {e}")
139
163
  return results
140
164
 
165
+
141
166
  class URLSource(Source):
142
167
  source_type = "urls"
143
168
 
@@ -146,19 +171,16 @@ class URLSource(Source):
146
171
  self.field_name = field_name
147
172
 
148
173
  @classmethod
149
- def example(cls) -> 'URLSource':
174
+ def example(cls) -> "URLSource":
150
175
  """Return an example URLSource instance."""
151
- return cls(
152
- urls=['http://www.example.com'],
153
- field_name="text"
154
- )
155
-
176
+ return cls(urls=["http://www.example.com"], field_name="text")
177
+
156
178
  def to_scenario_list(self):
157
179
  """Create a ScenarioList from a list of URLs."""
158
180
  import requests
159
-
181
+
160
182
  from .scenario_list import ScenarioList
161
-
183
+
162
184
  result = ScenarioList()
163
185
  for url in self.urls:
164
186
  try:
@@ -169,9 +191,9 @@ class URLSource(Source):
169
191
  except requests.RequestException as e:
170
192
  warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
171
193
  continue
172
-
194
+
173
195
  return result
174
-
196
+
175
197
 
176
198
  class ListSource(Source):
177
199
  source_type = "list"
@@ -182,26 +204,26 @@ class ListSource(Source):
182
204
  self.use_indexes = use_indexes
183
205
 
184
206
  @classmethod
185
- def example(cls) -> 'ListSource':
207
+ def example(cls) -> "ListSource":
186
208
  """Return an example ListSource instance."""
187
209
  return cls(
188
210
  field_name="text",
189
211
  values=["example1", "example2", "example3"],
190
- use_indexes=True
212
+ use_indexes=True,
191
213
  )
192
214
 
193
215
  def to_scenario_list(self):
194
216
  """Create a ScenarioList from a list of values with a specified field name."""
195
217
  from .scenario_list import ScenarioList
196
-
218
+
197
219
  scenarios = []
198
-
220
+
199
221
  for i, value in enumerate(self.values):
200
222
  scenario_dict = {self.field_name: value}
201
223
  if self.use_indexes:
202
224
  scenario_dict["idx"] = i
203
225
  scenarios.append(Scenario(scenario_dict))
204
-
226
+
205
227
  return ScenarioList(scenarios)
206
228
 
207
229
 
@@ -225,48 +247,48 @@ class DirectorySource(Source):
225
247
  self.ignore_files = ignore_files or []
226
248
 
227
249
  @classmethod
228
- def example(cls) -> 'DirectorySource':
250
+ def example(cls) -> "DirectorySource":
229
251
  """Return an example DirectorySource instance."""
230
252
  import tempfile
231
253
  import os
232
-
254
+
233
255
  # Create a temporary directory for the example
234
256
  temp_dir = tempfile.mkdtemp(prefix="edsl_test_")
235
-
257
+
236
258
  # Create some sample files in the directory
237
259
  with open(os.path.join(temp_dir, "test1.txt"), "w") as f:
238
260
  f.write("Sample content 1")
239
-
261
+
240
262
  with open(os.path.join(temp_dir, "test2.txt"), "w") as f:
241
263
  f.write("Sample content 2")
242
-
264
+
243
265
  # Create a subdirectory with a file
244
266
  subdir = os.path.join(temp_dir, "subdir")
245
267
  os.makedirs(subdir, exist_ok=True)
246
268
  with open(os.path.join(subdir, "test3.txt"), "w") as f:
247
269
  f.write("Sample content 3")
248
-
270
+
249
271
  return cls(
250
272
  directory=temp_dir,
251
273
  pattern="*.txt",
252
274
  recursive=True,
253
275
  metadata=True,
254
276
  ignore_dirs=["__pycache__"],
255
- ignore_files=["*.pyc"]
277
+ ignore_files=["*.pyc"],
256
278
  )
257
-
279
+
258
280
  def to_scenario_list(self):
259
281
  """Create a ScenarioList from files in a directory."""
260
282
  import os
261
283
  import glob
262
-
284
+
263
285
  from .scenario_list import ScenarioList
264
-
286
+
265
287
  # Set default recursive value
266
288
  recursive = self.recursive
267
-
289
+
268
290
  # Handle paths with wildcards properly
269
- if '*' in self.directory:
291
+ if "*" in self.directory:
270
292
  # Handle "**/*.py" patterns (recursive wildcard)
271
293
  if "**" in self.directory:
272
294
  parts = self.directory.split("**")
@@ -287,52 +309,58 @@ class DirectorySource(Source):
287
309
  else:
288
310
  directory = self.directory
289
311
  pattern = self.pattern
290
-
312
+
291
313
  # Check if directory exists
292
314
  if not os.path.isdir(directory):
293
315
  from .exceptions import FileNotFoundScenarioError
316
+
294
317
  raise FileNotFoundScenarioError(f"Directory not found: {directory}")
295
-
318
+
296
319
  # Use glob directly for ** patterns to prevent duplicates
297
320
  if "**" in pattern:
298
321
  from .scenario_list import ScenarioList
299
322
  from .file_store import FileStore
300
-
323
+
301
324
  # Handle the pattern directly with glob
302
325
  full_pattern = os.path.join(directory, pattern)
303
326
  file_paths = glob.glob(full_pattern, recursive=True)
304
-
327
+
305
328
  # Remove duplicates (by converting to a set and back)
306
329
  file_paths = list(set(file_paths))
307
-
330
+
308
331
  # Create scenarios
309
332
  scenarios = []
310
333
  for file_path in file_paths:
311
334
  if os.path.isfile(file_path):
312
335
  # Check if file should be ignored
313
336
  file_name = os.path.basename(file_path)
314
- if any(fnmatch.fnmatch(file_name, ignore_pattern) for ignore_pattern in self.ignore_files or []):
337
+ if any(
338
+ fnmatch.fnmatch(file_name, ignore_pattern)
339
+ for ignore_pattern in self.ignore_files or []
340
+ ):
315
341
  continue
316
-
342
+
317
343
  # Create FileStore object
318
344
  file_store = FileStore(file_path)
319
-
345
+
320
346
  # Create scenario
321
347
  scenario_data = {"file": file_store}
322
-
348
+
323
349
  # Add metadata if requested
324
350
  if self.metadata:
325
351
  file_stat = os.stat(file_path)
326
- scenario_data.update({
327
- "file_path": file_path,
328
- "file_name": file_name,
329
- "file_size": file_stat.st_size,
330
- "file_created": file_stat.st_ctime,
331
- "file_modified": file_stat.st_mtime,
332
- })
333
-
352
+ scenario_data.update(
353
+ {
354
+ "file_path": file_path,
355
+ "file_name": file_name,
356
+ "file_size": file_stat.st_size,
357
+ "file_created": file_stat.st_ctime,
358
+ "file_modified": file_stat.st_mtime,
359
+ }
360
+ )
361
+
334
362
  scenarios.append(Scenario(scenario_data))
335
-
363
+
336
364
  return ScenarioList(scenarios)
337
365
  else:
338
366
  # Use the standard scanning method for non-** patterns
@@ -348,148 +376,146 @@ class DirectorySource(Source):
348
376
 
349
377
  class TuplesSource(Source):
350
378
  source_type = "list_of_tuples"
351
-
352
- def __init__(self, field_names: list[str], values: list[tuple], use_indexes: bool = False):
379
+
380
+ def __init__(
381
+ self, field_names: list[str], values: list[tuple], use_indexes: bool = False
382
+ ):
353
383
  self.field_names = field_names
354
384
  self.values = values
355
385
  self.use_indexes = use_indexes
356
-
386
+
357
387
  # Validate inputs
358
388
  if not all(isinstance(v, (tuple, list)) for v in values):
359
389
  raise ScenarioError("All values must be tuples or lists")
360
-
390
+
361
391
  @classmethod
362
- def example(cls) -> 'TuplesSource':
392
+ def example(cls) -> "TuplesSource":
363
393
  """Return an example TuplesSource instance."""
364
394
  return cls(
365
395
  field_names=["name", "age", "city"],
366
396
  values=[
367
397
  ("Alice", 30, "New York"),
368
398
  ("Bob", 25, "San Francisco"),
369
- ("Charlie", 35, "Boston")
399
+ ("Charlie", 35, "Boston"),
370
400
  ],
371
- use_indexes=True
401
+ use_indexes=True,
372
402
  )
373
-
403
+
374
404
  def to_scenario_list(self):
375
405
  """Create a ScenarioList from a list of tuples with specified field names."""
376
406
  from .scenario_list import ScenarioList
377
-
407
+
378
408
  scenarios = []
379
-
409
+
380
410
  for i, value_tuple in enumerate(self.values):
381
411
  if len(value_tuple) != len(self.field_names):
382
412
  raise ScenarioError(
383
413
  f"Tuple {i} has {len(value_tuple)} elements, but {len(self.field_names)} field names were provided."
384
414
  )
385
-
415
+
386
416
  scenario_dict = dict(zip(self.field_names, value_tuple))
387
417
  if self.use_indexes:
388
418
  scenario_dict["idx"] = i
389
419
  scenarios.append(Scenario(scenario_dict))
390
-
420
+
391
421
  return ScenarioList(scenarios)
392
422
 
393
423
 
394
424
  class SQLiteSource(Source):
395
425
  source_type = "sqlite"
396
-
426
+
397
427
  def __init__(self, db_path: str, table: str, fields: Optional[list] = None):
398
428
  self.db_path = db_path
399
429
  self.table = table
400
430
  self.fields = fields
401
-
431
+
402
432
  @classmethod
403
- def example(cls) -> 'SQLiteSource':
433
+ def example(cls) -> "SQLiteSource":
404
434
  """Return an example SQLiteSource instance."""
405
435
  import sqlite3
406
436
  import tempfile
407
437
  import os
408
-
438
+
409
439
  # Create a temporary SQLite database for the example
410
- fd, temp_path = tempfile.mkstemp(suffix='.db', prefix='edsl_test_')
440
+ fd, temp_path = tempfile.mkstemp(suffix=".db", prefix="edsl_test_")
411
441
  os.close(fd) # Close the file descriptor
412
-
442
+
413
443
  # Connect to the database and create a sample table
414
444
  conn = sqlite3.connect(temp_path)
415
445
  cursor = conn.cursor()
416
-
446
+
417
447
  # Create a simple table
418
- cursor.execute('CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)')
419
-
448
+ cursor.execute(
449
+ "CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)"
450
+ )
451
+
420
452
  # Insert sample data
421
- sample_data = [
422
- (1, 'Alpha', 100),
423
- (2, 'Beta', 200),
424
- (3, 'Gamma', 300)
425
- ]
426
- cursor.executemany('INSERT INTO test_table VALUES (?, ?, ?)', sample_data)
427
-
453
+ sample_data = [(1, "Alpha", 100), (2, "Beta", 200), (3, "Gamma", 300)]
454
+ cursor.executemany("INSERT INTO test_table VALUES (?, ?, ?)", sample_data)
455
+
428
456
  conn.commit()
429
457
  conn.close()
430
-
458
+
431
459
  return cls(
432
- db_path=temp_path,
433
- table='test_table',
434
- fields=['id', 'name', 'value']
460
+ db_path=temp_path, table="test_table", fields=["id", "name", "value"]
435
461
  )
436
-
462
+
437
463
  def to_scenario_list(self):
438
464
  """Create a ScenarioList from a SQLite database."""
439
465
  from .scenario_list import ScenarioList
440
466
  import sqlite3
441
-
467
+
442
468
  conn = sqlite3.connect(self.db_path)
443
469
  cursor = conn.cursor()
444
-
470
+
445
471
  # If fields weren't provided, get all fields from the table
446
472
  fields = self.fields
447
473
  if fields is None:
448
474
  cursor.execute(f"PRAGMA table_info({self.table})")
449
475
  fields = [row[1] for row in cursor.fetchall()]
450
-
476
+
451
477
  # Query the data
452
478
  field_placeholders = ", ".join(fields)
453
479
  cursor.execute(f"SELECT {field_placeholders} FROM {self.table}")
454
480
  rows = cursor.fetchall()
455
-
481
+
456
482
  # Create scenarios
457
483
  scenarios = []
458
484
  for row in rows:
459
485
  scenario_dict = dict(zip(fields, row))
460
486
  scenarios.append(Scenario(scenario_dict))
461
-
487
+
462
488
  conn.close()
463
489
  return ScenarioList(scenarios)
464
490
 
465
491
 
466
492
  class LaTeXSource(Source):
467
493
  source_type = "latex"
468
-
494
+
469
495
  def __init__(self, file_path: str, table_index: int = 0, has_header: bool = True):
470
496
  """
471
497
  Initialize a LaTeXSource with a LaTeX file path.
472
-
498
+
473
499
  Args:
474
500
  file_path: The path to the LaTeX file.
475
- table_index: The index of the table to extract (if multiple tables exist).
501
+ table_index: The index of the table to extract (if multiple tables exist).
476
502
  Default is 0 (first table).
477
503
  has_header: Whether the table has a header row. Default is True.
478
504
  """
479
505
  self.file_path = file_path
480
506
  self.table_index = table_index
481
507
  self.has_header = has_header
482
-
508
+
483
509
  @classmethod
484
- def example(cls) -> 'LaTeXSource':
510
+ def example(cls) -> "LaTeXSource":
485
511
  """Return an example LaTeXSource instance."""
486
512
  import tempfile
487
513
  import os
488
-
514
+
489
515
  # Create a temporary LaTeX file with a sample table
490
- fd, temp_path = tempfile.mkstemp(suffix='.tex', prefix='edsl_test_')
516
+ fd, temp_path = tempfile.mkstemp(suffix=".tex", prefix="edsl_test_")
491
517
  os.close(fd) # Close the file descriptor
492
-
518
+
493
519
  # Write a sample LaTeX table to the file
494
520
  sample_latex = r"""
495
521
  \documentclass{article}
@@ -505,39 +531,35 @@ Charlie & 35 & 92 \\
505
531
 
506
532
  \end{document}
507
533
  """
508
- with open(temp_path, 'w') as f:
534
+ with open(temp_path, "w") as f:
509
535
  f.write(sample_latex)
510
-
511
- return cls(
512
- file_path=temp_path,
513
- table_index=0,
514
- has_header=True
515
- )
516
-
536
+
537
+ return cls(file_path=temp_path, table_index=0, has_header=True)
538
+
517
539
  def to_scenario_list(self):
518
540
  """Create a ScenarioList from a LaTeX file."""
519
541
  from .scenario_list import ScenarioList
520
542
  import re
521
-
543
+
522
544
  with open(self.file_path, "r") as f:
523
545
  content = f.read()
524
-
546
+
525
547
  # Find all tabular environments
526
548
  tabular_pattern = r"\\begin{tabular}(.*?)\\end{tabular}"
527
549
  tables = re.findall(tabular_pattern, content, re.DOTALL)
528
-
550
+
529
551
  if not tables or self.table_index >= len(tables):
530
552
  raise ScenarioError(f"No table found at index {self.table_index}")
531
-
553
+
532
554
  table_content = tables[self.table_index]
533
-
555
+
534
556
  # Extract rows
535
557
  rows = table_content.split("\\\\")
536
558
  rows = [row.strip() for row in rows if row.strip()]
537
-
559
+
538
560
  if not rows:
539
561
  return ScenarioList()
540
-
562
+
541
563
  # Process header if available
542
564
  if self.has_header:
543
565
  header_row = rows[0]
@@ -545,98 +567,104 @@ Charlie & 35 & 92 \\
545
567
  if not header_cells:
546
568
  header_cells = header_row.split("&")
547
569
  header_cells = [h.strip() for h in header_cells]
548
-
570
+
549
571
  data_rows = rows[1:]
550
572
  else:
551
573
  # Auto-generate column names
552
574
  header_cells = [f"col{i}" for i in range(rows[0].count("&") + 1)]
553
575
  data_rows = rows
554
-
576
+
555
577
  # Process data rows
556
578
  scenarios = []
557
579
  for row in data_rows:
558
580
  cells = row.split("&")
559
581
  cells = [cell.strip() for cell in cells]
560
-
582
+
561
583
  if len(cells) != len(header_cells):
562
584
  continue # Skip malformed rows
563
-
585
+
564
586
  scenario_dict = dict(zip(header_cells, cells))
565
587
  scenarios.append(Scenario(scenario_dict))
566
-
588
+
567
589
  return ScenarioList(scenarios)
568
590
 
569
591
 
570
592
  class GoogleDocSource(Source):
571
593
  source_type = "google_doc"
572
-
594
+
573
595
  def __init__(self, url: str):
574
596
  """
575
597
  Initialize a GoogleDocSource with a Google Doc URL.
576
-
598
+
577
599
  Args:
578
600
  url: The URL to the Google Doc.
579
601
  """
580
602
  self.url = url
581
-
603
+
582
604
  @classmethod
583
- def example(cls) -> 'GoogleDocSource':
605
+ def example(cls) -> "GoogleDocSource":
584
606
  """Return an example GoogleDocSource instance."""
585
607
  # Create a mock instance that doesn't actually fetch a Google Doc
586
- instance = cls(url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit")
587
-
608
+ instance = cls(
609
+ url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit"
610
+ )
611
+
588
612
  # Override the to_scenario_list method just for the example
589
613
  def mock_to_scenario_list(self):
590
614
  from .scenario_list import ScenarioList
615
+
591
616
  # Create a simple mock ScenarioList with a few paragraphs
592
617
  scenarios = [
593
618
  Scenario({"text": "This is paragraph 1 from a sample Google Doc."}),
594
619
  Scenario({"text": "This is paragraph 2 with some more content."}),
595
- Scenario({"text": "This is the final paragraph with a conclusion."})
620
+ Scenario({"text": "This is the final paragraph with a conclusion."}),
596
621
  ]
597
622
  return ScenarioList(scenarios)
598
-
623
+
599
624
  # Replace the method on this instance only
600
625
  import types
626
+
601
627
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
602
-
628
+
603
629
  return instance
604
-
630
+
605
631
  def to_scenario_list(self):
606
632
  """Create a ScenarioList from a Google Doc."""
607
633
  from .scenario_list import ScenarioList
608
634
  import tempfile
609
635
  import requests
610
-
636
+
611
637
  # Extract the document ID from the URL
612
638
  if "/edit" in self.url:
613
639
  doc_id = self.url.split("/d/")[1].split("/edit")[0]
614
640
  else:
615
641
  raise ScenarioError("Invalid Google Doc URL format.")
616
-
642
+
617
643
  # Create the export URL to download as DOCX
618
644
  export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
619
-
645
+
620
646
  try:
621
647
  # Download the Google Doc as a Word file (.docx)
622
648
  response = requests.get(export_url)
623
649
  response.raise_for_status() # Ensure the request was successful
624
-
650
+
625
651
  # Save the Word file to a temporary file
626
652
  with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
627
653
  temp_file.write(response.content)
628
654
  temp_filename = temp_file.name
629
-
655
+
630
656
  # Use the DocxScenario class to process the temporary file
631
657
  from .scenario_list import ScenarioList
632
658
  from .DocxScenario import DocxScenario
633
-
659
+
634
660
  # Create a scenario from the DOCX file
635
661
  docx_scenario = DocxScenario(temp_filename)
636
- scenarios = [Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs]
637
-
662
+ scenarios = [
663
+ Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs
664
+ ]
665
+
638
666
  return ScenarioList(scenarios)
639
-
667
+
640
668
  except requests.RequestException as e:
641
669
  raise ScenarioError(f"Failed to fetch Google Doc: {str(e)}")
642
670
  except Exception as e:
@@ -645,79 +673,84 @@ class GoogleDocSource(Source):
645
673
 
646
674
  class PandasSource(Source):
647
675
  source_type = "pandas"
648
-
676
+
649
677
  def __init__(self, df):
650
678
  """
651
679
  Initialize a PandasSource with a pandas DataFrame.
652
-
680
+
653
681
  Args:
654
682
  df: A pandas DataFrame.
655
683
  """
656
684
  try:
657
685
  import pandas as pd
686
+
658
687
  if not isinstance(df, pd.DataFrame):
659
688
  raise ScenarioError("Input must be a pandas DataFrame")
660
689
  self.df = df
661
690
  except ImportError:
662
691
  raise ImportError("pandas is required for PandasSource")
663
-
692
+
664
693
  @classmethod
665
- def example(cls) -> 'PandasSource':
694
+ def example(cls) -> "PandasSource":
666
695
  """Return an example PandasSource instance."""
667
696
  try:
668
697
  import pandas as pd
669
-
698
+
670
699
  # Create a sample DataFrame for the example
671
700
  sample_data = {
672
- 'name': ['Alice', 'Bob', 'Charlie', 'David'],
673
- 'age': [30, 25, 35, 28],
674
- 'city': ['New York', 'San Francisco', 'Boston', 'Seattle']
701
+ "name": ["Alice", "Bob", "Charlie", "David"],
702
+ "age": [30, 25, 35, 28],
703
+ "city": ["New York", "San Francisco", "Boston", "Seattle"],
675
704
  }
676
705
  df = pd.DataFrame(sample_data)
677
-
706
+
678
707
  return cls(df)
679
708
  except ImportError:
680
709
  # Create a mock instance that doesn't actually need pandas
681
710
  instance = cls.__new__(cls)
682
-
711
+
683
712
  # Override the to_scenario_list method just for the example
684
713
  def mock_to_scenario_list(self):
685
714
  from .scenario_list import ScenarioList
715
+
686
716
  # Create a simple mock ScenarioList
687
717
  scenarios = [
688
718
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
689
719
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
690
720
  Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
691
- Scenario({"name": "David", "age": 28, "city": "Seattle"})
721
+ Scenario({"name": "David", "age": 28, "city": "Seattle"}),
692
722
  ]
693
723
  return ScenarioList(scenarios)
694
-
724
+
695
725
  # Replace the method on this instance only
696
726
  import types
697
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
698
-
727
+
728
+ instance.to_scenario_list = types.MethodType(
729
+ mock_to_scenario_list, instance
730
+ )
731
+
699
732
  return instance
700
-
733
+
701
734
  def to_scenario_list(self):
702
735
  """Create a ScenarioList from a pandas DataFrame."""
703
736
  from .scenario_list import ScenarioList
704
-
737
+
705
738
  # Convert DataFrame records to scenarios
706
739
  scenarios = []
707
740
  for _, row in self.df.iterrows():
708
741
  scenario_dict = row.to_dict()
709
742
  scenarios.append(Scenario(scenario_dict))
710
-
743
+
711
744
  return ScenarioList(scenarios)
712
745
 
713
746
 
714
747
  class StataSource(Source):
715
748
  source_type = "dta"
716
-
749
+
717
750
  def __init__(self, file_path: str, include_metadata: bool = True):
718
751
  """
719
752
  Initialize a StataSource with a path to a Stata data file.
720
-
753
+
721
754
  Args:
722
755
  file_path: Path to the Stata (.dta) file.
723
756
  include_metadata: If True, extract and preserve variable labels and value labels
@@ -725,102 +758,108 @@ class StataSource(Source):
725
758
  """
726
759
  self.file_path = file_path
727
760
  self.include_metadata = include_metadata
728
-
761
+
729
762
  @classmethod
730
- def example(cls) -> 'StataSource':
763
+ def example(cls) -> "StataSource":
731
764
  """Return an example StataSource instance."""
732
765
  import tempfile
733
766
  import os
734
-
767
+
735
768
  # Since we can't easily create a real Stata file for testing,
736
769
  # we'll create a mock instance with an override
737
770
  instance = cls(file_path="/path/to/nonexistent/file.dta")
738
-
771
+
739
772
  # Override the to_scenario_list method just for the example
740
773
  def mock_to_scenario_list(self):
741
774
  from .scenario_list import ScenarioList
742
-
775
+
743
776
  # Create a simple mock ScenarioList with Stata-like data
744
777
  scenarios = [
745
778
  Scenario({"id": 1, "gender": 1, "income": 50000, "education": 2}),
746
779
  Scenario({"id": 2, "gender": 2, "income": 45000, "education": 3}),
747
- Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4})
780
+ Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4}),
748
781
  ]
749
-
782
+
750
783
  result = ScenarioList(scenarios)
751
-
784
+
752
785
  # Add metadata similar to what would be in a Stata file
753
786
  if self.include_metadata:
754
787
  result.codebook = {
755
788
  "variable_labels": {
756
789
  "gender": "Gender (1=Male, 2=Female)",
757
790
  "income": "Annual income in USD",
758
- "education": "Education level (1-4)"
791
+ "education": "Education level (1-4)",
759
792
  },
760
793
  "value_labels": {
761
794
  "gender": {1: "Male", 2: "Female"},
762
- "education": {1: "High School", 2: "Associate", 3: "Bachelor", 4: "Graduate"}
763
- }
795
+ "education": {
796
+ 1: "High School",
797
+ 2: "Associate",
798
+ 3: "Bachelor",
799
+ 4: "Graduate",
800
+ },
801
+ },
764
802
  }
765
-
803
+
766
804
  return result
767
-
805
+
768
806
  # Replace the method on this instance only
769
807
  import types
808
+
770
809
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
771
-
810
+
772
811
  return instance
773
-
812
+
774
813
  def to_scenario_list(self):
775
814
  """Create a ScenarioList from a Stata data file."""
776
815
  from .scenario_list import ScenarioList
777
-
816
+
778
817
  try:
779
818
  import pandas as pd
780
819
  except ImportError:
781
820
  raise ImportError("pandas is required to read Stata files")
782
-
821
+
783
822
  # Read the Stata file with pandas
784
823
  df = pd.read_stata(self.file_path)
785
-
824
+
786
825
  # Create scenarios
787
826
  scenarios = []
788
827
  for _, row in df.iterrows():
789
828
  scenario_dict = row.to_dict()
790
829
  scenarios.append(Scenario(scenario_dict))
791
-
830
+
792
831
  # Create the basic ScenarioList
793
832
  result = ScenarioList(scenarios)
794
-
833
+
795
834
  # Extract and preserve metadata if requested
796
835
  if self.include_metadata:
797
836
  # Get variable labels (if any)
798
837
  variable_labels = {}
799
838
  if hasattr(df, "variable_labels") and df.variable_labels:
800
839
  variable_labels = df.variable_labels
801
-
840
+
802
841
  # Get value labels (if any)
803
842
  value_labels = {}
804
843
  if hasattr(df, "value_labels") and df.value_labels:
805
844
  value_labels = df.value_labels
806
-
845
+
807
846
  # Store the metadata in the ScenarioList's codebook
808
847
  if variable_labels or value_labels:
809
848
  result.codebook = {
810
849
  "variable_labels": variable_labels,
811
850
  "value_labels": value_labels,
812
851
  }
813
-
852
+
814
853
  return result
815
854
 
816
855
 
817
856
  class WikipediaSource(Source):
818
857
  source_type = "wikipedia"
819
-
858
+
820
859
  def __init__(self, url: str, table_index: int = 0, header: bool = True):
821
860
  """
822
861
  Initialize a WikipediaSource with a URL to a Wikipedia page.
823
-
862
+
824
863
  Args:
825
864
  url: The URL of the Wikipedia page.
826
865
  table_index: The index of the table to extract (default is 0).
@@ -829,74 +868,89 @@ class WikipediaSource(Source):
829
868
  self.url = url
830
869
  self.table_index = table_index
831
870
  self.header = header
832
-
871
+
833
872
  @classmethod
834
- def example(cls) -> 'WikipediaSource':
873
+ def example(cls) -> "WikipediaSource":
835
874
  """Return an example WikipediaSource instance."""
836
875
  # Use a real Wikipedia URL for the example, but we'll override the to_scenario_list method
837
876
  instance = cls(
838
877
  url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
839
878
  table_index=0,
840
- header=True
879
+ header=True,
841
880
  )
842
-
881
+
843
882
  # Override the to_scenario_list method just for the example
844
883
  def mock_to_scenario_list(self):
845
884
  from .scenario_list import ScenarioList
846
-
885
+
847
886
  # Create a simple mock ScenarioList with GDP data
848
887
  scenarios = [
849
- Scenario({"Rank": 1, "Country": "United States", "GDP (millions of USD)": 25460000}),
850
- Scenario({"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}),
851
- Scenario({"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}),
852
- Scenario({"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}),
853
- Scenario({"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000})
888
+ Scenario(
889
+ {
890
+ "Rank": 1,
891
+ "Country": "United States",
892
+ "GDP (millions of USD)": 25460000,
893
+ }
894
+ ),
895
+ Scenario(
896
+ {"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}
897
+ ),
898
+ Scenario(
899
+ {"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}
900
+ ),
901
+ Scenario(
902
+ {"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}
903
+ ),
904
+ Scenario(
905
+ {"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000}
906
+ ),
854
907
  ]
855
-
908
+
856
909
  return ScenarioList(scenarios)
857
-
910
+
858
911
  # Replace the method on this instance only
859
912
  import types
913
+
860
914
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
861
-
915
+
862
916
  return instance
863
-
917
+
864
918
  def to_scenario_list(self):
865
919
  """Create a ScenarioList from a table on a Wikipedia page."""
866
920
  from .scenario_list import ScenarioList
867
921
  import requests
868
-
922
+
869
923
  try:
870
924
  # Try to import pandas
871
925
  import pandas as pd
872
926
  except ImportError:
873
927
  raise ImportError("pandas is required to read Wikipedia tables")
874
-
928
+
875
929
  try:
876
930
  # Check if the URL is reachable
877
931
  response = requests.get(self.url)
878
932
  response.raise_for_status() # Raises HTTPError for bad responses
879
-
933
+
880
934
  # Extract tables from the Wikipedia page
881
935
  tables = pd.read_html(self.url, header=0 if self.header else None)
882
-
936
+
883
937
  # Ensure the requested table index is within the range of available tables
884
938
  if self.table_index >= len(tables) or self.table_index < 0:
885
939
  raise ScenarioError(
886
940
  f"Table index {self.table_index} is out of range. This page has {len(tables)} table(s)."
887
941
  )
888
-
942
+
889
943
  # Get the requested table
890
944
  df = tables[self.table_index]
891
-
945
+
892
946
  # Convert DataFrame to ScenarioList
893
947
  scenarios = []
894
948
  for _, row in df.iterrows():
895
949
  scenario_dict = row.to_dict()
896
950
  scenarios.append(Scenario(scenario_dict))
897
-
951
+
898
952
  return ScenarioList(scenarios)
899
-
953
+
900
954
  except requests.exceptions.RequestException as e:
901
955
  raise ScenarioError(f"Error fetching the URL: {str(e)}")
902
956
  except ValueError as e:
@@ -907,18 +961,18 @@ class WikipediaSource(Source):
907
961
 
908
962
  class ExcelSource(Source):
909
963
  source_type = "excel"
910
-
964
+
911
965
  def __init__(
912
- self,
913
- file_path: str,
914
- sheet_name: Optional[str] = None,
966
+ self,
967
+ file_path: str,
968
+ sheet_name: Optional[str] = None,
915
969
  skip_rows: Optional[List[int]] = None,
916
970
  use_codebook: bool = False,
917
- **kwargs
971
+ **kwargs,
918
972
  ):
919
973
  """
920
974
  Initialize an ExcelSource with a path to an Excel file.
921
-
975
+
922
976
  Args:
923
977
  file_path: Path to the Excel file.
924
978
  sheet_name: Name of the sheet to load. If None and multiple sheets exist,
@@ -932,76 +986,81 @@ class ExcelSource(Source):
932
986
  self.skip_rows = skip_rows
933
987
  self.use_codebook = use_codebook
934
988
  self.kwargs = kwargs
935
-
989
+
936
990
  @classmethod
937
- def example(cls) -> 'ExcelSource':
991
+ def example(cls) -> "ExcelSource":
938
992
  """Return an example ExcelSource instance."""
939
993
  import tempfile
940
994
  import os
941
-
995
+
942
996
  try:
943
997
  import pandas as pd
944
-
998
+
945
999
  # Create a temporary Excel file with sample data
946
- fd, temp_path = tempfile.mkstemp(suffix='.xlsx', prefix='edsl_test_')
1000
+ fd, temp_path = tempfile.mkstemp(suffix=".xlsx", prefix="edsl_test_")
947
1001
  os.close(fd) # Close the file descriptor
948
-
1002
+
949
1003
  # Create sample data
950
- df1 = pd.DataFrame({
951
- 'name': ['Alice', 'Bob', 'Charlie'],
952
- 'age': [30, 25, 35],
953
- 'city': ['New York', 'San Francisco', 'Boston']
954
- })
955
-
956
- df2 = pd.DataFrame({
957
- 'name': ['David', 'Eve'],
958
- 'age': [40, 45],
959
- 'city': ['Seattle', 'Chicago']
960
- })
961
-
1004
+ df1 = pd.DataFrame(
1005
+ {
1006
+ "name": ["Alice", "Bob", "Charlie"],
1007
+ "age": [30, 25, 35],
1008
+ "city": ["New York", "San Francisco", "Boston"],
1009
+ }
1010
+ )
1011
+
1012
+ df2 = pd.DataFrame(
1013
+ {
1014
+ "name": ["David", "Eve"],
1015
+ "age": [40, 45],
1016
+ "city": ["Seattle", "Chicago"],
1017
+ }
1018
+ )
1019
+
962
1020
  # Write to Excel file with multiple sheets
963
1021
  with pd.ExcelWriter(temp_path) as writer:
964
- df1.to_excel(writer, sheet_name='Sheet1', index=False)
965
- df2.to_excel(writer, sheet_name='Sheet2', index=False)
966
-
967
- return cls(
968
- file_path=temp_path,
969
- sheet_name='Sheet1'
970
- )
971
-
1022
+ df1.to_excel(writer, sheet_name="Sheet1", index=False)
1023
+ df2.to_excel(writer, sheet_name="Sheet2", index=False)
1024
+
1025
+ return cls(file_path=temp_path, sheet_name="Sheet1")
1026
+
972
1027
  except ImportError:
973
1028
  # Create a mock instance with an override if pandas is not available
974
1029
  instance = cls(file_path="/path/to/nonexistent/file.xlsx")
975
-
1030
+
976
1031
  # Override the to_scenario_list method just for the example
977
1032
  def mock_to_scenario_list(self):
978
1033
  from .scenario_list import ScenarioList
1034
+
979
1035
  # Create a simple mock ScenarioList with sample data
980
1036
  scenarios = [
981
1037
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
982
1038
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
983
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1039
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
984
1040
  ]
985
1041
  return ScenarioList(scenarios)
986
-
1042
+
987
1043
  # Replace the method on this instance only
988
1044
  import types
989
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
990
-
1045
+
1046
+ instance.to_scenario_list = types.MethodType(
1047
+ mock_to_scenario_list, instance
1048
+ )
1049
+
991
1050
  return instance
992
-
1051
+
993
1052
  def to_scenario_list(self):
994
1053
  """Create a ScenarioList from an Excel file."""
995
1054
  from .scenario_list import ScenarioList
996
-
1055
+
997
1056
  try:
998
1057
  import pandas as pd
999
1058
  except ImportError:
1000
1059
  raise ImportError("pandas is required to read Excel files")
1001
-
1060
+
1002
1061
  # Get all sheets
1003
1062
  all_sheets = pd.read_excel(self.file_path, sheet_name=None)
1004
-
1063
+
1005
1064
  # If no sheet_name is provided and there is more than one sheet, print available sheets
1006
1065
  sheet_name = self.sheet_name
1007
1066
  if sheet_name is None:
@@ -1015,27 +1074,27 @@ class ExcelSource(Source):
1015
1074
  else:
1016
1075
  # If there is only one sheet, use it
1017
1076
  sheet_name = list(all_sheets.keys())[0]
1018
-
1077
+
1019
1078
  # Load the specified or determined sheet
1020
1079
  df = pd.read_excel(self.file_path, sheet_name=sheet_name, **self.kwargs)
1021
-
1080
+
1022
1081
  # Skip specified rows if any
1023
1082
  if self.skip_rows:
1024
1083
  df = df.drop(self.skip_rows)
1025
1084
  # Reset index to ensure continuous indexing
1026
1085
  df = df.reset_index(drop=True)
1027
-
1086
+
1028
1087
  # Handle codebook if requested
1029
1088
  if self.use_codebook:
1030
1089
  codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
1031
1090
  koobedoc = {col: f"col_{i}" for i, col in enumerate(df.columns)}
1032
-
1091
+
1033
1092
  # Create scenarios with renamed columns
1034
1093
  scenarios = []
1035
1094
  for _, row in df.iterrows():
1036
1095
  scenario_dict = {koobedoc.get(k): v for k, v in row.to_dict().items()}
1037
1096
  scenarios.append(Scenario(scenario_dict))
1038
-
1097
+
1039
1098
  result = ScenarioList(scenarios)
1040
1099
  result.codebook = codebook
1041
1100
  return result
@@ -1045,23 +1104,23 @@ class ExcelSource(Source):
1045
1104
  for _, row in df.iterrows():
1046
1105
  scenario_dict = row.to_dict()
1047
1106
  scenarios.append(Scenario(scenario_dict))
1048
-
1107
+
1049
1108
  return ScenarioList(scenarios)
1050
1109
 
1051
1110
 
1052
1111
  class GoogleSheetSource(Source):
1053
1112
  source_type = "google_sheet"
1054
-
1113
+
1055
1114
  def __init__(
1056
- self,
1057
- url: str,
1058
- sheet_name: Optional[str] = None,
1115
+ self,
1116
+ url: str,
1117
+ sheet_name: Optional[str] = None,
1059
1118
  column_names: Optional[List[str]] = None,
1060
- **kwargs
1119
+ **kwargs,
1061
1120
  ):
1062
1121
  """
1063
1122
  Initialize a GoogleSheetSource with a URL to a Google Sheet.
1064
-
1123
+
1065
1124
  Args:
1066
1125
  url: The URL of the Google Sheet.
1067
1126
  sheet_name: The name of the sheet to load. If None, the first sheet will be used.
@@ -1073,67 +1132,68 @@ class GoogleSheetSource(Source):
1073
1132
  self.sheet_name = sheet_name
1074
1133
  self.column_names = column_names
1075
1134
  self.kwargs = kwargs
1076
-
1135
+
1077
1136
  @classmethod
1078
- def example(cls) -> 'GoogleSheetSource':
1137
+ def example(cls) -> "GoogleSheetSource":
1079
1138
  """Return an example GoogleSheetSource instance."""
1080
1139
  # Use a mock instance since we can't create a real Google Sheet for testing
1081
1140
  instance = cls(
1082
1141
  url="https://docs.google.com/spreadsheets/d/1234567890abcdefg/edit",
1083
- sheet_name="Sheet1"
1142
+ sheet_name="Sheet1",
1084
1143
  )
1085
-
1144
+
1086
1145
  # Override the to_scenario_list method just for the example
1087
1146
  def mock_to_scenario_list(self):
1088
1147
  from .scenario_list import ScenarioList
1089
-
1148
+
1090
1149
  # Create a simple mock ScenarioList with sample data
1091
1150
  scenarios = [
1092
1151
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
1093
1152
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
1094
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1153
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
1095
1154
  ]
1096
1155
  return ScenarioList(scenarios)
1097
-
1156
+
1098
1157
  # Replace the method on this instance only
1099
1158
  import types
1159
+
1100
1160
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1101
-
1161
+
1102
1162
  return instance
1103
-
1163
+
1104
1164
  def to_scenario_list(self):
1105
1165
  """Create a ScenarioList from a Google Sheet."""
1106
1166
  from .scenario_list import ScenarioList
1107
1167
  import tempfile
1108
1168
  import requests
1109
-
1169
+
1110
1170
  # Extract the sheet ID from the URL
1111
1171
  if "/edit" in self.url:
1112
1172
  sheet_id = self.url.split("/d/")[1].split("/edit")[0]
1113
1173
  else:
1114
1174
  raise ScenarioError("Invalid Google Sheet URL format.")
1115
-
1175
+
1116
1176
  # Create the export URL for XLSX format
1117
- export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
1118
-
1177
+ export_url = (
1178
+ f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
1179
+ )
1180
+
1119
1181
  try:
1120
1182
  # Download the Google Sheet as an Excel file
1121
1183
  response = requests.get(export_url)
1122
1184
  response.raise_for_status() # Ensure the request was successful
1123
-
1185
+
1124
1186
  # Save the Excel file to a temporary file
1125
1187
  with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
1126
1188
  temp_file.write(response.content)
1127
1189
  temp_filename = temp_file.name
1128
-
1190
+
1129
1191
  # Use ExcelSource to create the initial ScenarioList
1130
1192
  excel_source = ExcelSource(
1131
- file_path=temp_filename,
1132
- sheet_name=self.sheet_name,
1133
- **self.kwargs
1193
+ file_path=temp_filename, sheet_name=self.sheet_name, **self.kwargs
1134
1194
  )
1135
1195
  scenario_list = excel_source.to_scenario_list()
1136
-
1196
+
1137
1197
  # Apply column renaming if specified
1138
1198
  if self.column_names is not None and scenario_list:
1139
1199
  if len(self.column_names) != len(scenario_list[0].keys()):
@@ -1141,21 +1201,23 @@ class GoogleSheetSource(Source):
1141
1201
  f"Number of provided column names ({len(self.column_names)}) "
1142
1202
  f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
1143
1203
  )
1144
-
1204
+
1145
1205
  # Create a mapping from original keys to new names
1146
1206
  original_keys = list(scenario_list[0].keys())
1147
1207
  column_mapping = dict(zip(original_keys, self.column_names))
1148
-
1208
+
1149
1209
  # Create a new ScenarioList with renamed columns
1150
1210
  renamed_scenarios = []
1151
1211
  for scenario in scenario_list:
1152
- renamed_scenario = {column_mapping.get(k, k): v for k, v in scenario.items()}
1212
+ renamed_scenario = {
1213
+ column_mapping.get(k, k): v for k, v in scenario.items()
1214
+ }
1153
1215
  renamed_scenarios.append(Scenario(renamed_scenario))
1154
-
1216
+
1155
1217
  return ScenarioList(renamed_scenarios)
1156
-
1218
+
1157
1219
  return scenario_list
1158
-
1220
+
1159
1221
  except requests.exceptions.RequestException as e:
1160
1222
  raise ScenarioError(f"Error fetching the Google Sheet: {str(e)}")
1161
1223
  except Exception as e:
@@ -1164,18 +1226,18 @@ class GoogleSheetSource(Source):
1164
1226
 
1165
1227
  class DelimitedFileSource(Source):
1166
1228
  source_type = "delimited_file"
1167
-
1229
+
1168
1230
  def __init__(
1169
- self,
1231
+ self,
1170
1232
  file_or_url: str,
1171
1233
  delimiter: str = ",",
1172
1234
  has_header: bool = True,
1173
1235
  encoding: str = "utf-8",
1174
- **kwargs
1236
+ **kwargs,
1175
1237
  ):
1176
1238
  """
1177
1239
  Initialize a DelimitedFileSource with a path to a delimited file or URL.
1178
-
1240
+
1179
1241
  Args:
1180
1242
  file_or_url: Path to a local file or URL to a remote file.
1181
1243
  delimiter: The delimiter character used in the file (default is ',').
@@ -1188,42 +1250,38 @@ class DelimitedFileSource(Source):
1188
1250
  self.has_header = has_header
1189
1251
  self.encoding = encoding
1190
1252
  self.kwargs = kwargs
1191
-
1253
+
1192
1254
  @classmethod
1193
- def example(cls) -> 'DelimitedFileSource':
1255
+ def example(cls) -> "DelimitedFileSource":
1194
1256
  """Return an example DelimitedFileSource instance."""
1195
1257
  import tempfile
1196
1258
  import os
1197
-
1259
+
1198
1260
  # Create a temporary CSV file with sample data
1199
- fd, temp_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
1261
+ fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
1200
1262
  os.close(fd) # Close the file descriptor
1201
-
1263
+
1202
1264
  # Write sample data to the file
1203
- with open(temp_path, 'w', newline='') as f:
1265
+ with open(temp_path, "w", newline="") as f:
1204
1266
  f.write("name,age,city\n")
1205
1267
  f.write("Alice,30,New York\n")
1206
1268
  f.write("Bob,25,San Francisco\n")
1207
1269
  f.write("Charlie,35,Boston\n")
1208
-
1209
- return cls(
1210
- file_or_url=temp_path,
1211
- delimiter=",",
1212
- has_header=True
1213
- )
1214
-
1270
+
1271
+ return cls(file_or_url=temp_path, delimiter=",", has_header=True)
1272
+
1215
1273
  def to_scenario_list(self):
1216
1274
  """Create a ScenarioList from a delimited file or URL."""
1217
1275
  from .scenario_list import ScenarioList
1218
1276
  import requests
1219
-
1277
+
1220
1278
  # Check if the input is a URL
1221
1279
  parsed_url = urlparse(self.file_or_url)
1222
1280
  if parsed_url.scheme in ("http", "https"):
1223
1281
  try:
1224
1282
  headers = {
1225
1283
  "Accept": "text/csv,application/csv,text/plain",
1226
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
1284
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
1227
1285
  }
1228
1286
  response = requests.get(self.file_or_url, headers=headers)
1229
1287
  response.raise_for_status()
@@ -1240,7 +1298,7 @@ class DelimitedFileSource(Source):
1240
1298
  encodings_to_try = ["latin-1", "cp1252", "ISO-8859-1"]
1241
1299
  if self.encoding in encodings_to_try:
1242
1300
  encodings_to_try.remove(self.encoding)
1243
-
1301
+
1244
1302
  for encoding in encodings_to_try:
1245
1303
  try:
1246
1304
  with open(self.file_or_url, "r", encoding=encoding) as f:
@@ -1249,17 +1307,21 @@ class DelimitedFileSource(Source):
1249
1307
  except UnicodeDecodeError:
1250
1308
  continue
1251
1309
  else:
1252
- raise ScenarioError(f"Failed to decode file with any of the attempted encodings")
1310
+ raise ScenarioError(
1311
+ f"Failed to decode file with any of the attempted encodings"
1312
+ )
1253
1313
  except Exception as e:
1254
1314
  raise ScenarioError(f"Failed to read file: {str(e)}")
1255
-
1315
+
1256
1316
  # Parse the content
1257
- csv_reader = csv.reader(StringIO(content), delimiter=self.delimiter, **self.kwargs)
1317
+ csv_reader = csv.reader(
1318
+ StringIO(content), delimiter=self.delimiter, **self.kwargs
1319
+ )
1258
1320
  rows = list(csv_reader)
1259
-
1321
+
1260
1322
  if not rows:
1261
1323
  return ScenarioList()
1262
-
1324
+
1263
1325
  # Handle header row
1264
1326
  if self.has_header:
1265
1327
  header = rows[0]
@@ -1268,33 +1330,50 @@ class DelimitedFileSource(Source):
1268
1330
  # Auto-generate column names
1269
1331
  header = [f"col{i}" for i in range(len(rows[0]))]
1270
1332
  data_rows = rows
1271
-
1333
+
1334
+ header_counts = defaultdict(lambda: 0)
1335
+ new_header = []
1336
+ for h in header:
1337
+ print(header_counts)
1338
+ if header_counts[h] >= 1:
1339
+ new_header.append(f"{h}_{header_counts[h]}")
1340
+ warnings.warn(
1341
+ f"Duplicate header found: {h}. Renamed to {h}_{header_counts[h]}"
1342
+ )
1343
+ else:
1344
+ new_header.append(h)
1345
+ header_counts[h] += 1
1346
+
1347
+ assert len(new_header) == len(set(new_header))
1348
+
1272
1349
  # Create scenarios
1273
1350
  scenarios = []
1274
1351
  for row in data_rows:
1275
- if len(row) != len(header):
1276
- warnings.warn(f"Skipping row with {len(row)} values (expected {len(header)})")
1352
+ if len(row) != len(new_header):
1353
+ warnings.warn(
1354
+ f"Skipping row with {len(row)} values (expected {len(header)})"
1355
+ )
1277
1356
  continue
1278
-
1279
- scenario_dict = dict(zip(header, row))
1357
+
1358
+ scenario_dict = dict(zip(new_header, row))
1280
1359
  scenarios.append(Scenario(scenario_dict))
1281
-
1360
+
1282
1361
  return ScenarioList(scenarios)
1283
1362
 
1284
1363
 
1285
1364
  class CSVSource(DelimitedFileSource):
1286
1365
  source_type = "csv"
1287
-
1366
+
1288
1367
  def __init__(
1289
- self,
1368
+ self,
1290
1369
  file_or_url: str,
1291
1370
  has_header: bool = True,
1292
1371
  encoding: str = "utf-8",
1293
- **kwargs
1372
+ **kwargs,
1294
1373
  ):
1295
1374
  """
1296
1375
  Initialize a CSVSource with a path to a CSV file or URL.
1297
-
1376
+
1298
1377
  Args:
1299
1378
  file_or_url: Path to a local file or URL to a remote file.
1300
1379
  has_header: Whether the file has a header row (default is True).
@@ -1306,45 +1385,42 @@ class CSVSource(DelimitedFileSource):
1306
1385
  delimiter=",",
1307
1386
  has_header=has_header,
1308
1387
  encoding=encoding,
1309
- **kwargs
1388
+ **kwargs,
1310
1389
  )
1311
-
1390
+
1312
1391
  @classmethod
1313
- def example(cls) -> 'CSVSource':
1392
+ def example(cls) -> "CSVSource":
1314
1393
  """Return an example CSVSource instance."""
1315
1394
  import tempfile
1316
1395
  import os
1317
-
1396
+
1318
1397
  # Create a temporary CSV file with sample data
1319
- fd, temp_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
1398
+ fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
1320
1399
  os.close(fd) # Close the file descriptor
1321
-
1400
+
1322
1401
  # Write sample data to the file
1323
- with open(temp_path, 'w', newline='') as f:
1402
+ with open(temp_path, "w", newline="") as f:
1324
1403
  f.write("name,age,city\n")
1325
1404
  f.write("Alice,30,New York\n")
1326
1405
  f.write("Bob,25,San Francisco\n")
1327
1406
  f.write("Charlie,35,Boston\n")
1328
-
1329
- return cls(
1330
- file_or_url=temp_path,
1331
- has_header=True
1332
- )
1407
+
1408
+ return cls(file_or_url=temp_path, has_header=True)
1333
1409
 
1334
1410
 
1335
1411
  class TSVSource(DelimitedFileSource):
1336
1412
  source_type = "tsv"
1337
-
1413
+
1338
1414
  def __init__(
1339
- self,
1415
+ self,
1340
1416
  file_or_url: str,
1341
1417
  has_header: bool = True,
1342
1418
  encoding: str = "utf-8",
1343
- **kwargs
1419
+ **kwargs,
1344
1420
  ):
1345
1421
  """
1346
1422
  Initialize a TSVSource with a path to a TSV file or URL.
1347
-
1423
+
1348
1424
  Args:
1349
1425
  file_or_url: Path to a local file or URL to a remote file.
1350
1426
  has_header: Whether the file has a header row (default is True).
@@ -1356,130 +1432,134 @@ class TSVSource(DelimitedFileSource):
1356
1432
  delimiter="\t",
1357
1433
  has_header=has_header,
1358
1434
  encoding=encoding,
1359
- **kwargs
1435
+ **kwargs,
1360
1436
  )
1361
-
1437
+
1362
1438
  @classmethod
1363
- def example(cls) -> 'TSVSource':
1439
+ def example(cls) -> "TSVSource":
1364
1440
  """Return an example TSVSource instance."""
1365
1441
  import tempfile
1366
1442
  import os
1367
-
1443
+
1368
1444
  # Create a temporary TSV file with sample data
1369
- fd, temp_path = tempfile.mkstemp(suffix='.tsv', prefix='edsl_test_')
1445
+ fd, temp_path = tempfile.mkstemp(suffix=".tsv", prefix="edsl_test_")
1370
1446
  os.close(fd) # Close the file descriptor
1371
-
1447
+
1372
1448
  # Write sample data to the file
1373
- with open(temp_path, 'w', newline='') as f:
1449
+ with open(temp_path, "w", newline="") as f:
1374
1450
  f.write("name\tage\tcity\n")
1375
1451
  f.write("Alice\t30\tNew York\n")
1376
1452
  f.write("Bob\t25\tSan Francisco\n")
1377
1453
  f.write("Charlie\t35\tBoston\n")
1378
-
1379
- return cls(
1380
- file_or_url=temp_path,
1381
- has_header=True
1382
- )
1454
+
1455
+ return cls(file_or_url=temp_path, has_header=True)
1456
+
1383
1457
 
1384
1458
  class ParquetSource(Source):
1385
1459
  source_type = "parquet"
1386
-
1460
+
1387
1461
  def __init__(self, file_path: str):
1388
1462
  """
1389
1463
  Initialize a ParquetSource with a path to a Parquet file.
1390
-
1464
+
1391
1465
  Args:
1392
1466
  file_path: Path to the Parquet file.
1393
1467
  """
1394
1468
  self.file_path = file_path
1395
-
1469
+
1396
1470
  @classmethod
1397
- def example(cls) -> 'ParquetSource':
1471
+ def example(cls) -> "ParquetSource":
1398
1472
  """Return an example ParquetSource instance."""
1399
1473
  import tempfile
1400
1474
  import os
1401
-
1475
+
1402
1476
  try:
1403
1477
  import pandas as pd
1404
1478
  import pyarrow as pa
1405
1479
  import pyarrow.parquet as pq
1406
-
1480
+
1407
1481
  # Create a temporary Parquet file with sample data
1408
- fd, temp_path = tempfile.mkstemp(suffix='.parquet', prefix='edsl_test_')
1482
+ fd, temp_path = tempfile.mkstemp(suffix=".parquet", prefix="edsl_test_")
1409
1483
  os.close(fd) # Close the file descriptor
1410
-
1484
+
1411
1485
  # Create sample data
1412
- df = pd.DataFrame({
1413
- 'name': ['Alice', 'Bob', 'Charlie'],
1414
- 'age': [30, 25, 35],
1415
- 'city': ['New York', 'San Francisco', 'Boston']
1416
- })
1417
-
1486
+ df = pd.DataFrame(
1487
+ {
1488
+ "name": ["Alice", "Bob", "Charlie"],
1489
+ "age": [30, 25, 35],
1490
+ "city": ["New York", "San Francisco", "Boston"],
1491
+ }
1492
+ )
1493
+
1418
1494
  # Write to Parquet file
1419
1495
  df.to_parquet(temp_path)
1420
-
1496
+
1421
1497
  return cls(file_path=temp_path)
1422
-
1498
+
1423
1499
  except ImportError:
1424
1500
  # Create a mock instance with an override if pandas or pyarrow is not available
1425
1501
  instance = cls(file_path="/path/to/nonexistent/file.parquet")
1426
-
1502
+
1427
1503
  # Override the to_scenario_list method just for the example
1428
1504
  def mock_to_scenario_list(self):
1429
1505
  from .scenario_list import ScenarioList
1506
+
1430
1507
  # Create a simple mock ScenarioList with sample data
1431
1508
  scenarios = [
1432
1509
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
1433
1510
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
1434
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1511
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
1435
1512
  ]
1436
1513
  return ScenarioList(scenarios)
1437
-
1514
+
1438
1515
  # Replace the method on this instance only
1439
1516
  import types
1440
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1441
-
1517
+
1518
+ instance.to_scenario_list = types.MethodType(
1519
+ mock_to_scenario_list, instance
1520
+ )
1521
+
1442
1522
  return instance
1443
-
1523
+
1444
1524
  def to_scenario_list(self):
1445
1525
  """Create a ScenarioList from a Parquet file."""
1446
1526
  from .scenario_list import ScenarioList
1447
-
1527
+
1448
1528
  try:
1449
1529
  import pandas as pd
1450
1530
  except ImportError:
1451
1531
  raise ImportError("pandas is required to read Parquet files")
1452
-
1532
+
1453
1533
  try:
1454
1534
  import pyarrow
1455
1535
  except ImportError:
1456
1536
  raise ImportError("pyarrow is required to read Parquet files")
1457
-
1537
+
1458
1538
  # Read the Parquet file
1459
1539
  df = pd.read_parquet(self.file_path)
1460
-
1540
+
1461
1541
  # Convert DataFrame to ScenarioList
1462
1542
  scenarios = []
1463
1543
  for _, row in df.iterrows():
1464
1544
  scenario_dict = row.to_dict()
1465
1545
  scenarios.append(Scenario(scenario_dict))
1466
-
1546
+
1467
1547
  return ScenarioList(scenarios)
1468
1548
 
1469
1549
 
1470
1550
  class PDFSource(Source):
1471
1551
  source_type = "pdf"
1472
-
1552
+
1473
1553
  def __init__(
1474
- self,
1554
+ self,
1475
1555
  file_path: str,
1476
1556
  chunk_type: Literal["page", "text"] = "page",
1477
1557
  chunk_size: int = 1,
1478
- chunk_overlap: int = 0
1558
+ chunk_overlap: int = 0,
1479
1559
  ):
1480
1560
  """
1481
1561
  Initialize a PDFSource with a path to a PDF file.
1482
-
1562
+
1483
1563
  Args:
1484
1564
  file_path: Path to the PDF file or URL to a PDF.
1485
1565
  chunk_type: Type of chunking to use ("page" or "text").
@@ -1490,39 +1570,53 @@ class PDFSource(Source):
1490
1570
  self.chunk_type = chunk_type
1491
1571
  self.chunk_size = chunk_size
1492
1572
  self.chunk_overlap = chunk_overlap
1493
-
1573
+
1494
1574
  @classmethod
1495
- def example(cls) -> 'PDFSource':
1575
+ def example(cls) -> "PDFSource":
1496
1576
  """Return an example PDFSource instance."""
1497
1577
  # Skip actual file creation and just use a mock instance
1498
1578
  instance = cls(
1499
1579
  file_path="/path/to/nonexistent/file.pdf",
1500
1580
  chunk_type="page",
1501
1581
  chunk_size=1,
1502
- chunk_overlap=0
1582
+ chunk_overlap=0,
1503
1583
  )
1504
-
1584
+
1505
1585
  # Override the to_scenario_list method just for the example
1506
1586
  def mock_to_scenario_list(self):
1507
1587
  from .scenario_list import ScenarioList
1588
+
1508
1589
  # Create a simple mock ScenarioList with sample PDF data
1509
1590
  scenarios = [
1510
- Scenario({"filename": "example.pdf", "page": 1, "text": "This is page 1 content"}),
1511
- Scenario({"filename": "example.pdf", "page": 2, "text": "This is page 2 content"})
1591
+ Scenario(
1592
+ {
1593
+ "filename": "example.pdf",
1594
+ "page": 1,
1595
+ "text": "This is page 1 content",
1596
+ }
1597
+ ),
1598
+ Scenario(
1599
+ {
1600
+ "filename": "example.pdf",
1601
+ "page": 2,
1602
+ "text": "This is page 2 content",
1603
+ }
1604
+ ),
1512
1605
  ]
1513
1606
  return ScenarioList(scenarios)
1514
-
1607
+
1515
1608
  # Replace the method on this instance only
1516
1609
  import types
1610
+
1517
1611
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1518
-
1612
+
1519
1613
  return instance
1520
-
1614
+
1521
1615
  def to_scenario_list(self):
1522
1616
  """Create a ScenarioList from a PDF file."""
1523
1617
  from .scenario_list import ScenarioList
1524
1618
  from .scenario_list_pdf_tools import PdfTools
1525
-
1619
+
1526
1620
  try:
1527
1621
  # Check if it's a URL
1528
1622
  if PdfTools.is_url(self.file_path):
@@ -1534,14 +1628,16 @@ class PDFSource(Source):
1534
1628
  )
1535
1629
  else:
1536
1630
  # It's a regular URL
1537
- local_path = PdfTools.fetch_and_save_pdf(self.file_path, "temp_pdf.pdf")
1631
+ local_path = PdfTools.fetch_and_save_pdf(
1632
+ self.file_path, "temp_pdf.pdf"
1633
+ )
1538
1634
  else:
1539
1635
  # It's a local file path
1540
1636
  local_path = self.file_path
1541
-
1637
+
1542
1638
  # Extract scenarios from the PDF
1543
1639
  scenarios = list(PdfTools.extract_text_from_pdf(local_path))
1544
-
1640
+
1545
1641
  # Handle chunking based on the specified parameters
1546
1642
  if self.chunk_type == "page":
1547
1643
  # Default behavior - one scenario per page
@@ -1551,31 +1647,31 @@ class PDFSource(Source):
1551
1647
  combined_text = ""
1552
1648
  for scenario in scenarios:
1553
1649
  combined_text += scenario["text"]
1554
-
1650
+
1555
1651
  # Create a single scenario with all text
1556
1652
  base_scenario = scenarios[0].copy()
1557
1653
  base_scenario["text"] = combined_text
1558
1654
  return ScenarioList([base_scenario])
1559
1655
  else:
1560
- raise ValueError(f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'.")
1561
-
1656
+ raise ValueError(
1657
+ f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'."
1658
+ )
1659
+
1562
1660
  except Exception as e:
1563
1661
  from .exceptions import ScenarioError
1662
+
1564
1663
  raise ScenarioError(f"Error processing PDF: {str(e)}")
1565
1664
 
1566
1665
 
1567
1666
  class PDFImageSource(Source):
1568
1667
  source_type = "pdf_to_image"
1569
-
1668
+
1570
1669
  def __init__(
1571
- self,
1572
- file_path: str,
1573
- base_width: int = 2000,
1574
- include_text: bool = True
1670
+ self, file_path: str, base_width: int = 2000, include_text: bool = True
1575
1671
  ):
1576
1672
  """
1577
1673
  Initialize a PDFImageSource with a path to a PDF file.
1578
-
1674
+
1579
1675
  Args:
1580
1676
  file_path: Path to the PDF file.
1581
1677
  base_width: Width to use for the generated images.
@@ -1584,74 +1680,91 @@ class PDFImageSource(Source):
1584
1680
  self.file_path = file_path
1585
1681
  self.base_width = base_width
1586
1682
  self.include_text = include_text
1587
-
1683
+
1588
1684
  @classmethod
1589
- def example(cls) -> 'PDFImageSource':
1685
+ def example(cls) -> "PDFImageSource":
1590
1686
  """Return an example PDFImageSource instance."""
1591
1687
  # Skip actual file creation and just use a mock instance
1592
1688
  instance = cls(
1593
1689
  file_path="/path/to/nonexistent/file.pdf",
1594
1690
  base_width=2000,
1595
- include_text=True
1691
+ include_text=True,
1596
1692
  )
1597
-
1693
+
1598
1694
  # Override the to_scenario_list method just for the example
1599
1695
  def mock_to_scenario_list(self):
1600
1696
  from .scenario_list import ScenarioList
1697
+
1601
1698
  # Create a simple mock ScenarioList with sample PDF image data
1602
1699
  scenarios = [
1603
- Scenario({"filepath": "/tmp/page_1.jpeg", "page": 0, "text": "This is page 1 content"}),
1604
- Scenario({"filepath": "/tmp/page_2.jpeg", "page": 1, "text": "This is page 2 content"})
1700
+ Scenario(
1701
+ {
1702
+ "filepath": "/tmp/page_1.jpeg",
1703
+ "page": 0,
1704
+ "text": "This is page 1 content",
1705
+ }
1706
+ ),
1707
+ Scenario(
1708
+ {
1709
+ "filepath": "/tmp/page_2.jpeg",
1710
+ "page": 1,
1711
+ "text": "This is page 2 content",
1712
+ }
1713
+ ),
1605
1714
  ]
1606
1715
  return ScenarioList(scenarios)
1607
-
1716
+
1608
1717
  # Replace the method on this instance only
1609
1718
  import types
1719
+
1610
1720
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1611
-
1721
+
1612
1722
  return instance
1613
-
1723
+
1614
1724
  def to_scenario_list(self):
1615
1725
  """Create a ScenarioList from a PDF file, converting pages to images."""
1616
1726
  from .scenario_list import ScenarioList
1617
1727
  from .scenario_list_pdf_tools import PdfTools
1618
-
1728
+
1619
1729
  try:
1620
1730
  # Import pdf2image library
1621
1731
  try:
1622
1732
  from pdf2image import convert_from_path
1623
1733
  except ImportError:
1624
- raise ImportError("pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'.")
1625
-
1734
+ raise ImportError(
1735
+ "pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'."
1736
+ )
1737
+
1626
1738
  # Convert PDF pages to images
1627
1739
  scenarios = PdfTools.from_pdf_to_image(self.file_path, image_format="jpeg")
1628
1740
  return ScenarioList(scenarios)
1629
-
1741
+
1630
1742
  except Exception as e:
1631
1743
  from .exceptions import ScenarioError
1744
+
1632
1745
  raise ScenarioError(f"Error converting PDF to images: {str(e)}")
1633
1746
 
1634
1747
 
1635
1748
  class ScenarioSource:
1636
1749
  """
1637
1750
  Factory class for creating ScenarioList objects from various sources.
1638
-
1751
+
1639
1752
  This class provides static methods for creating ScenarioList objects from different
1640
1753
  data sources, centralizing the creation logic that was previously scattered across
1641
1754
  different classmethods in the ScenarioList class.
1642
-
1755
+
1643
1756
  The main entry point is the from_source method, which dispatches to appropriate
1644
1757
  source-specific methods based on the source_type parameter.
1645
1758
  """
1646
-
1759
+
1647
1760
  @staticmethod
1648
1761
  def from_source(source_type: str, *args, **kwargs):
1649
1762
  """
1650
1763
  Create a ScenarioList from a specified source type.
1651
-
1764
+
1652
1765
  This method serves as the main entry point for creating ScenarioList objects,
1653
1766
  dispatching to the appropriate source-specific method based on the source_type.
1654
-
1767
+
1655
1768
  Args:
1656
1769
  source_type: The type of source to create a ScenarioList from.
1657
1770
  Valid values include: 'urls', 'directory', 'list', 'list_of_tuples',
@@ -1660,10 +1773,10 @@ class ScenarioSource:
1660
1773
  'nested_dict', 'parquet', 'pdf', 'pdf_to_image'.
1661
1774
  *args: Positional arguments to pass to the source-specific method.
1662
1775
  **kwargs: Keyword arguments to pass to the source-specific method.
1663
-
1776
+
1664
1777
  Returns:
1665
1778
  A ScenarioList object created from the specified source.
1666
-
1779
+
1667
1780
  Raises:
1668
1781
  ValueError: If the source_type is not recognized.
1669
1782
  """
@@ -1679,14 +1792,14 @@ class ScenarioSource:
1679
1792
  return method(*args, **kwargs)
1680
1793
  else:
1681
1794
  raise ValueError(f"Unsupported source type: {source_type}")
1682
-
1795
+
1683
1796
  @staticmethod
1684
1797
  def _from_urls(urls: list[str], field_name: Optional[str] = "text"):
1685
1798
  """Create a ScenarioList from a list of URLs."""
1686
1799
  from .scenario_list import ScenarioList
1687
-
1800
+
1688
1801
  import requests
1689
-
1802
+
1690
1803
  result = ScenarioList()
1691
1804
  for url in urls:
1692
1805
  try:
@@ -1697,9 +1810,9 @@ class ScenarioSource:
1697
1810
  except requests.RequestException as e:
1698
1811
  warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
1699
1812
  continue
1700
-
1813
+
1701
1814
  return result
1702
-
1815
+
1703
1816
  @staticmethod
1704
1817
  def _from_directory(
1705
1818
  directory: str,
@@ -1713,7 +1826,7 @@ class ScenarioSource:
1713
1826
  warnings.warn(
1714
1827
  "_from_directory is deprecated. Use DirectorySource directly or ScenarioSource.from_source('directory', ...) instead.",
1715
1828
  DeprecationWarning,
1716
- stacklevel=2
1829
+ stacklevel=2,
1717
1830
  )
1718
1831
  source = DirectorySource(
1719
1832
  directory=directory,
@@ -1721,23 +1834,21 @@ class ScenarioSource:
1721
1834
  recursive=recursive,
1722
1835
  metadata=metadata,
1723
1836
  ignore_dirs=ignore_dirs,
1724
- ignore_files=ignore_files
1837
+ ignore_files=ignore_files,
1725
1838
  )
1726
1839
  return source.to_scenario_list()
1727
-
1840
+
1728
1841
  @staticmethod
1729
- def _from_list(
1730
- field_name: str, values: list, use_indexes: bool = False
1731
- ):
1842
+ def _from_list(field_name: str, values: list, use_indexes: bool = False):
1732
1843
  """Create a ScenarioList from a list of values with a specified field name."""
1733
1844
  warnings.warn(
1734
1845
  "_from_list is deprecated. Use ListSource directly or ScenarioSource.from_source('list', ...) instead.",
1735
1846
  DeprecationWarning,
1736
- stacklevel=2
1847
+ stacklevel=2,
1737
1848
  )
1738
1849
  source = ListSource(field_name, values, use_indexes)
1739
1850
  return source.to_scenario_list()
1740
-
1851
+
1741
1852
  @staticmethod
1742
1853
  def _from_list_of_tuples(
1743
1854
  field_names: list[str], values: list[tuple], use_indexes: bool = False
@@ -1746,107 +1857,106 @@ class ScenarioSource:
1746
1857
  warnings.warn(
1747
1858
  "_from_list_of_tuples is deprecated. Use TuplesSource directly or ScenarioSource.from_source('list_of_tuples', ...) instead.",
1748
1859
  DeprecationWarning,
1749
- stacklevel=2
1860
+ stacklevel=2,
1750
1861
  )
1751
1862
  source = TuplesSource(field_names, values, use_indexes)
1752
1863
  return source.to_scenario_list()
1753
-
1864
+
1754
1865
  @staticmethod
1755
- def _from_sqlite(
1756
- db_path: str, table: str, fields: Optional[list] = None
1757
- ):
1866
+ def _from_sqlite(db_path: str, table: str, fields: Optional[list] = None):
1758
1867
  """Create a ScenarioList from a SQLite database."""
1759
1868
  warnings.warn(
1760
1869
  "_from_sqlite is deprecated. Use SQLiteSource directly or ScenarioSource.from_source('sqlite', ...) instead.",
1761
1870
  DeprecationWarning,
1762
- stacklevel=2
1871
+ stacklevel=2,
1763
1872
  )
1764
1873
  source = SQLiteSource(db_path, table, fields)
1765
1874
  return source.to_scenario_list()
1766
-
1875
+
1767
1876
  @staticmethod
1768
- def _from_latex(
1769
- file_path: str, table_index: int = 0, has_header: bool = True
1770
- ):
1877
+ def _from_latex(file_path: str, table_index: int = 0, has_header: bool = True):
1771
1878
  """Create a ScenarioList from a LaTeX file."""
1772
1879
  warnings.warn(
1773
1880
  "_from_latex is deprecated. Use LaTeXSource directly or ScenarioSource.from_source('latex', ...) instead.",
1774
1881
  DeprecationWarning,
1775
- stacklevel=2
1882
+ stacklevel=2,
1776
1883
  )
1777
1884
  source = LaTeXSource(file_path, table_index, has_header)
1778
1885
  return source.to_scenario_list()
1779
-
1886
+
1780
1887
  @staticmethod
1781
1888
  def _from_google_doc(url: str):
1782
1889
  """Create a ScenarioList from a Google Doc."""
1783
1890
  warnings.warn(
1784
1891
  "_from_google_doc is deprecated. Use GoogleDocSource directly or ScenarioSource.from_source('google_doc', ...) instead.",
1785
1892
  DeprecationWarning,
1786
- stacklevel=2
1893
+ stacklevel=2,
1787
1894
  )
1788
1895
  source = GoogleDocSource(url)
1789
1896
  return source.to_scenario_list()
1790
-
1897
+
1791
1898
  @staticmethod
1792
1899
  def _from_pandas(df):
1793
1900
  """Create a ScenarioList from a pandas DataFrame."""
1794
1901
  warnings.warn(
1795
1902
  "_from_pandas is deprecated. Use PandasSource directly or ScenarioSource.from_source('pandas', ...) instead.",
1796
1903
  DeprecationWarning,
1797
- stacklevel=2
1904
+ stacklevel=2,
1798
1905
  )
1799
1906
  source = PandasSource(df)
1800
1907
  return source.to_scenario_list()
1801
-
1908
+
1802
1909
  @staticmethod
1803
1910
  def _from_dta(file_path: str, include_metadata: bool = True):
1804
1911
  """Create a ScenarioList from a Stata data file."""
1805
1912
  warnings.warn(
1806
1913
  "_from_dta is deprecated. Use StataSource directly or ScenarioSource.from_source('dta', ...) instead.",
1807
1914
  DeprecationWarning,
1808
- stacklevel=2
1915
+ stacklevel=2,
1809
1916
  )
1810
1917
  source = StataSource(file_path, include_metadata)
1811
1918
  return source.to_scenario_list()
1812
-
1919
+
1813
1920
  @staticmethod
1814
- def _from_wikipedia(
1815
- url: str, table_index: int = 0, header: bool = True
1816
- ):
1921
+ def _from_wikipedia(url: str, table_index: int = 0, header: bool = True):
1817
1922
  """Create a ScenarioList from a table on a Wikipedia page."""
1818
1923
  warnings.warn(
1819
1924
  "_from_wikipedia is deprecated. Use WikipediaSource directly or ScenarioSource.from_source('wikipedia', ...) instead.",
1820
1925
  DeprecationWarning,
1821
- stacklevel=2
1926
+ stacklevel=2,
1822
1927
  )
1823
1928
  source = WikipediaSource(url, table_index, header)
1824
1929
  return source.to_scenario_list()
1825
-
1930
+
1826
1931
  @staticmethod
1827
- def _from_excel(
1828
- file_path: str, sheet_name: Optional[str] = None, **kwargs
1829
- ):
1932
+ def _from_excel(file_path: str, sheet_name: Optional[str] = None, **kwargs):
1830
1933
  """Create a ScenarioList from an Excel file."""
1831
1934
  warnings.warn(
1832
1935
  "_from_excel is deprecated. Use ExcelSource directly or ScenarioSource.from_source('excel', ...) instead.",
1833
1936
  DeprecationWarning,
1834
- stacklevel=2
1937
+ stacklevel=2,
1835
1938
  )
1836
1939
  source = ExcelSource(file_path, sheet_name=sheet_name, **kwargs)
1837
1940
  return source.to_scenario_list()
1838
-
1941
+
1839
1942
  @staticmethod
1840
- def _from_google_sheet(url: str, sheet_name: Optional[str] = None, column_names: Optional[List[str]] = None, **kwargs):
1943
+ def _from_google_sheet(
1944
+ url: str,
1945
+ sheet_name: Optional[str] = None,
1946
+ column_names: Optional[List[str]] = None,
1947
+ **kwargs,
1948
+ ):
1841
1949
  """Create a ScenarioList from a Google Sheet."""
1842
1950
  warnings.warn(
1843
1951
  "_from_google_sheet is deprecated. Use GoogleSheetSource directly or ScenarioSource.from_source('google_sheet', ...) instead.",
1844
1952
  DeprecationWarning,
1845
- stacklevel=2
1953
+ stacklevel=2,
1954
+ )
1955
+ source = GoogleSheetSource(
1956
+ url, sheet_name=sheet_name, column_names=column_names, **kwargs
1846
1957
  )
1847
- source = GoogleSheetSource(url, sheet_name=sheet_name, column_names=column_names, **kwargs)
1848
1958
  return source.to_scenario_list()
1849
-
1959
+
1850
1960
  @staticmethod
1851
1961
  def _from_delimited_file(
1852
1962
  file_or_url: str,
@@ -1859,44 +1969,44 @@ class ScenarioSource:
1859
1969
  warnings.warn(
1860
1970
  "_from_delimited_file is deprecated. Use DelimitedFileSource directly or ScenarioSource.from_source('delimited_file', ...) instead.",
1861
1971
  DeprecationWarning,
1862
- stacklevel=2
1972
+ stacklevel=2,
1863
1973
  )
1864
1974
  source = DelimitedFileSource(
1865
1975
  file_or_url=file_or_url,
1866
1976
  delimiter=delimiter,
1867
1977
  has_header=has_header,
1868
1978
  encoding=encoding,
1869
- **kwargs
1979
+ **kwargs,
1870
1980
  )
1871
1981
  return source.to_scenario_list()
1872
-
1982
+
1873
1983
  @staticmethod
1874
1984
  def _from_csv(file_or_url: str, **kwargs):
1875
1985
  """Create a ScenarioList from a CSV file or URL."""
1876
1986
  warnings.warn(
1877
1987
  "_from_csv is deprecated. Use CSVSource directly or ScenarioSource.from_source('csv', ...) instead.",
1878
1988
  DeprecationWarning,
1879
- stacklevel=2
1989
+ stacklevel=2,
1880
1990
  )
1881
1991
  source = CSVSource(file_or_url=file_or_url, **kwargs)
1882
1992
  return source.to_scenario_list()
1883
-
1993
+
1884
1994
  @staticmethod
1885
1995
  def _from_tsv(file_or_url: str, **kwargs):
1886
1996
  """Create a ScenarioList from a TSV file or URL."""
1887
1997
  warnings.warn(
1888
1998
  "_from_tsv is deprecated. Use TSVSource directly or ScenarioSource.from_source('tsv', ...) instead.",
1889
1999
  DeprecationWarning,
1890
- stacklevel=2
2000
+ stacklevel=2,
1891
2001
  )
1892
2002
  source = TSVSource(file_or_url=file_or_url, **kwargs)
1893
2003
  return source.to_scenario_list()
1894
-
2004
+
1895
2005
  @staticmethod
1896
2006
  def _from_dict(data: dict):
1897
2007
  """Create a ScenarioList from a dictionary."""
1898
2008
  from .scenario_list import ScenarioList
1899
-
2009
+
1900
2010
  if "scenarios" in data:
1901
2011
  scenarios = [Scenario(s) for s in data["scenarios"]]
1902
2012
  codebook = data.get("codebook", {})
@@ -1907,48 +2017,48 @@ class ScenarioSource:
1907
2017
  field_names = list(data.keys())
1908
2018
  if not all(isinstance(v, list) for v in data.values()):
1909
2019
  raise ScenarioError("All values in the dictionary must be lists")
1910
-
2020
+
1911
2021
  # Check all lists have the same length
1912
2022
  list_lengths = [len(v) for v in data.values()]
1913
2023
  if not all(l == list_lengths[0] for l in list_lengths):
1914
2024
  raise ScenarioError("All lists must have the same length")
1915
-
2025
+
1916
2026
  # Create scenarios
1917
2027
  for i in range(list_lengths[0]):
1918
2028
  scenario_dict = {k: data[k][i] for k in field_names}
1919
2029
  scenarios.append(Scenario(scenario_dict))
1920
-
2030
+
1921
2031
  return ScenarioList(scenarios)
1922
-
2032
+
1923
2033
  @staticmethod
1924
2034
  def _from_nested_dict(data: dict, id_field: Optional[str] = None):
1925
2035
  """Create a ScenarioList from a nested dictionary."""
1926
2036
  from .scenario_list import ScenarioList
1927
-
2037
+
1928
2038
  scenarios = []
1929
-
2039
+
1930
2040
  for key, value in data.items():
1931
2041
  if not isinstance(value, dict):
1932
2042
  raise ScenarioError(f"Value for key {key} is not a dictionary")
1933
-
2043
+
1934
2044
  scenario_dict = value.copy()
1935
2045
  if id_field:
1936
2046
  scenario_dict[id_field] = key
1937
2047
  scenarios.append(Scenario(scenario_dict))
1938
-
2048
+
1939
2049
  return ScenarioList(scenarios)
1940
-
2050
+
1941
2051
  @staticmethod
1942
2052
  def _from_parquet(file_path: str):
1943
2053
  """Create a ScenarioList from a Parquet file."""
1944
2054
  warnings.warn(
1945
2055
  "_from_parquet is deprecated. Use ParquetSource directly or ScenarioSource.from_source('parquet', ...) instead.",
1946
2056
  DeprecationWarning,
1947
- stacklevel=2
2057
+ stacklevel=2,
1948
2058
  )
1949
2059
  source = ParquetSource(file_path)
1950
2060
  return source.to_scenario_list()
1951
-
2061
+
1952
2062
  @staticmethod
1953
2063
  def _from_pdf(
1954
2064
  file_path: str,
@@ -1960,16 +2070,16 @@ class ScenarioSource:
1960
2070
  warnings.warn(
1961
2071
  "_from_pdf is deprecated. Use PDFSource directly or ScenarioSource.from_source('pdf', ...) instead.",
1962
2072
  DeprecationWarning,
1963
- stacklevel=2
2073
+ stacklevel=2,
1964
2074
  )
1965
2075
  source = PDFSource(
1966
2076
  file_path=file_path,
1967
2077
  chunk_type=chunk_type,
1968
2078
  chunk_size=chunk_size,
1969
- chunk_overlap=chunk_overlap
2079
+ chunk_overlap=chunk_overlap,
1970
2080
  )
1971
2081
  return source.to_scenario_list()
1972
-
2082
+
1973
2083
  @staticmethod
1974
2084
  def _from_pdf_to_image(
1975
2085
  file_path: str,
@@ -1980,11 +2090,9 @@ class ScenarioSource:
1980
2090
  warnings.warn(
1981
2091
  "_from_pdf_to_image is deprecated. Use PDFImageSource directly or ScenarioSource.from_source('pdf_to_image', ...) instead.",
1982
2092
  DeprecationWarning,
1983
- stacklevel=2
2093
+ stacklevel=2,
1984
2094
  )
1985
2095
  source = PDFImageSource(
1986
- file_path=file_path,
1987
- base_width=base_width,
1988
- include_text=include_text
2096
+ file_path=file_path, base_width=base_width, include_text=include_text
1989
2097
  )
1990
- return source.to_scenario_list()
2098
+ return source.to_scenario_list()