edsl 0.1.60__py3-none-any.whl → 0.1.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,32 +16,53 @@ from __future__ import annotations
16
16
  import functools
17
17
  import warnings
18
18
  import fnmatch
19
- from typing import Any, Callable, List, Literal, Optional, Type, TypeVar, Union, TYPE_CHECKING, cast, Any
19
+ from collections import defaultdict
20
+ import warnings
21
+ from typing import (
22
+ Any,
23
+ Callable,
24
+ List,
25
+ Literal,
26
+ Optional,
27
+ Type,
28
+ TypeVar,
29
+ Union,
30
+ TYPE_CHECKING,
31
+ cast,
32
+ Any,
33
+ )
34
+
35
+ T = TypeVar("T")
20
36
 
21
- T = TypeVar('T')
22
37
 
23
- def deprecated_classmethod(alternative: str) -> Callable[[Callable[..., T]], Callable[..., T]]:
38
+ def deprecated_classmethod(
39
+ alternative: str,
40
+ ) -> Callable[[Callable[..., T]], Callable[..., T]]:
24
41
  """
25
42
  Decorator that marks a class method as deprecated.
26
-
43
+
27
44
  Args:
28
45
  alternative: The suggested alternative to use instead
29
-
46
+
30
47
  Returns:
31
48
  A decorator function that wraps the original method with a deprecation warning
32
49
  """
50
+
33
51
  def decorator(func: Callable[..., T]) -> Callable[..., T]:
34
52
  @functools.wraps(func)
35
53
  def wrapper(*args: Any, **kwargs: Any) -> T:
36
54
  warnings.warn(
37
55
  f"{func.__qualname__} is deprecated. Use {alternative} instead.",
38
56
  DeprecationWarning,
39
- stacklevel=2
57
+ stacklevel=2,
40
58
  )
41
59
  return func(*args, **kwargs)
60
+
42
61
  return wrapper
62
+
43
63
  return decorator
44
64
 
65
+
45
66
  import os
46
67
  import csv
47
68
  import json
@@ -61,27 +82,28 @@ from .exceptions import ScenarioError
61
82
 
62
83
  from abc import ABC, abstractmethod
63
84
 
85
+
64
86
  class Source(ABC):
65
87
  # Registry to store child classes and their source types
66
- _registry: dict[str, Type['Source']] = {}
88
+ _registry: dict[str, Type["Source"]] = {}
67
89
 
68
90
  def __init_subclass__(cls, **kwargs):
69
91
  """Automatically register subclasses with their source_type."""
70
92
  super().__init_subclass__(**kwargs)
71
- if hasattr(cls, 'source_type'):
93
+ if hasattr(cls, "source_type"):
72
94
  Source._registry[cls.source_type] = cls
73
95
 
74
96
  @classmethod
75
97
  @abstractmethod
76
- def example(cls) -> 'Source':
98
+ def example(cls) -> "Source":
77
99
  """
78
100
  Return an example instance of this Source type.
79
-
101
+
80
102
  This method should return a valid instance of the Source subclass
81
103
  that can be used for testing. The instance should be created with
82
104
  reasonable default values that will produce a valid ScenarioList
83
105
  when to_scenario_list() is called.
84
-
106
+
85
107
  Returns:
86
108
  An instance of the Source subclass
87
109
  """
@@ -91,14 +113,14 @@ class Source(ABC):
91
113
  def to_scenario_list(self):
92
114
  """
93
115
  Convert the source to a ScenarioList.
94
-
116
+
95
117
  Returns:
96
118
  A ScenarioList containing the data from this source
97
119
  """
98
120
  pass
99
121
 
100
122
  @classmethod
101
- def get_source_class(cls, source_type: str) -> Type['Source']:
123
+ def get_source_class(cls, source_type: str) -> Type["Source"]:
102
124
  """Get the Source subclass for a given source_type."""
103
125
  if source_type not in cls._registry:
104
126
  raise ValueError(f"No Source subclass found for source_type: {source_type}")
@@ -114,12 +136,12 @@ class Source(ABC):
114
136
  """
115
137
  Test all registered source types by creating an example instance
116
138
  and calling to_scenario_list() on it.
117
-
139
+
118
140
  Returns:
119
141
  A dictionary mapping source types to boolean success values
120
142
  """
121
143
  from .scenario_list import ScenarioList
122
-
144
+
123
145
  results = {}
124
146
  for source_type, source_class in cls._registry.items():
125
147
  try:
@@ -130,7 +152,9 @@ class Source(ABC):
130
152
  # Basic validation
131
153
  if not isinstance(scenario_list, ScenarioList):
132
154
  results[source_type] = False
133
- print(f"Source {source_type} returned {type(scenario_list)} instead of ScenarioList")
155
+ print(
156
+ f"Source {source_type} returned {type(scenario_list)} instead of ScenarioList"
157
+ )
134
158
  else:
135
159
  results[source_type] = True
136
160
  except Exception as e:
@@ -138,6 +162,7 @@ class Source(ABC):
138
162
  print(f"Source {source_type} exception: {e}")
139
163
  return results
140
164
 
165
+
141
166
  class URLSource(Source):
142
167
  source_type = "urls"
143
168
 
@@ -146,19 +171,16 @@ class URLSource(Source):
146
171
  self.field_name = field_name
147
172
 
148
173
  @classmethod
149
- def example(cls) -> 'URLSource':
174
+ def example(cls) -> "URLSource":
150
175
  """Return an example URLSource instance."""
151
- return cls(
152
- urls=['http://www.example.com'],
153
- field_name="text"
154
- )
155
-
176
+ return cls(urls=["http://www.example.com"], field_name="text")
177
+
156
178
  def to_scenario_list(self):
157
179
  """Create a ScenarioList from a list of URLs."""
158
180
  import requests
159
-
181
+
160
182
  from .scenario_list import ScenarioList
161
-
183
+
162
184
  result = ScenarioList()
163
185
  for url in self.urls:
164
186
  try:
@@ -169,9 +191,9 @@ class URLSource(Source):
169
191
  except requests.RequestException as e:
170
192
  warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
171
193
  continue
172
-
194
+
173
195
  return result
174
-
196
+
175
197
 
176
198
  class ListSource(Source):
177
199
  source_type = "list"
@@ -182,26 +204,26 @@ class ListSource(Source):
182
204
  self.use_indexes = use_indexes
183
205
 
184
206
  @classmethod
185
- def example(cls) -> 'ListSource':
207
+ def example(cls) -> "ListSource":
186
208
  """Return an example ListSource instance."""
187
209
  return cls(
188
210
  field_name="text",
189
211
  values=["example1", "example2", "example3"],
190
- use_indexes=True
212
+ use_indexes=True,
191
213
  )
192
214
 
193
215
  def to_scenario_list(self):
194
216
  """Create a ScenarioList from a list of values with a specified field name."""
195
217
  from .scenario_list import ScenarioList
196
-
218
+
197
219
  scenarios = []
198
-
220
+
199
221
  for i, value in enumerate(self.values):
200
222
  scenario_dict = {self.field_name: value}
201
223
  if self.use_indexes:
202
224
  scenario_dict["idx"] = i
203
225
  scenarios.append(Scenario(scenario_dict))
204
-
226
+
205
227
  return ScenarioList(scenarios)
206
228
 
207
229
 
@@ -225,48 +247,48 @@ class DirectorySource(Source):
225
247
  self.ignore_files = ignore_files or []
226
248
 
227
249
  @classmethod
228
- def example(cls) -> 'DirectorySource':
250
+ def example(cls) -> "DirectorySource":
229
251
  """Return an example DirectorySource instance."""
230
252
  import tempfile
231
253
  import os
232
-
254
+
233
255
  # Create a temporary directory for the example
234
256
  temp_dir = tempfile.mkdtemp(prefix="edsl_test_")
235
-
257
+
236
258
  # Create some sample files in the directory
237
259
  with open(os.path.join(temp_dir, "test1.txt"), "w") as f:
238
260
  f.write("Sample content 1")
239
-
261
+
240
262
  with open(os.path.join(temp_dir, "test2.txt"), "w") as f:
241
263
  f.write("Sample content 2")
242
-
264
+
243
265
  # Create a subdirectory with a file
244
266
  subdir = os.path.join(temp_dir, "subdir")
245
267
  os.makedirs(subdir, exist_ok=True)
246
268
  with open(os.path.join(subdir, "test3.txt"), "w") as f:
247
269
  f.write("Sample content 3")
248
-
270
+
249
271
  return cls(
250
272
  directory=temp_dir,
251
273
  pattern="*.txt",
252
274
  recursive=True,
253
275
  metadata=True,
254
276
  ignore_dirs=["__pycache__"],
255
- ignore_files=["*.pyc"]
277
+ ignore_files=["*.pyc"],
256
278
  )
257
-
279
+
258
280
  def to_scenario_list(self):
259
281
  """Create a ScenarioList from files in a directory."""
260
282
  import os
261
283
  import glob
262
-
284
+
263
285
  from .scenario_list import ScenarioList
264
-
286
+
265
287
  # Set default recursive value
266
288
  recursive = self.recursive
267
-
289
+
268
290
  # Handle paths with wildcards properly
269
- if '*' in self.directory:
291
+ if "*" in self.directory:
270
292
  # Handle "**/*.py" patterns (recursive wildcard)
271
293
  if "**" in self.directory:
272
294
  parts = self.directory.split("**")
@@ -287,52 +309,58 @@ class DirectorySource(Source):
287
309
  else:
288
310
  directory = self.directory
289
311
  pattern = self.pattern
290
-
312
+
291
313
  # Check if directory exists
292
314
  if not os.path.isdir(directory):
293
315
  from .exceptions import FileNotFoundScenarioError
316
+
294
317
  raise FileNotFoundScenarioError(f"Directory not found: {directory}")
295
-
318
+
296
319
  # Use glob directly for ** patterns to prevent duplicates
297
320
  if "**" in pattern:
298
321
  from .scenario_list import ScenarioList
299
322
  from .file_store import FileStore
300
-
323
+
301
324
  # Handle the pattern directly with glob
302
325
  full_pattern = os.path.join(directory, pattern)
303
326
  file_paths = glob.glob(full_pattern, recursive=True)
304
-
327
+
305
328
  # Remove duplicates (by converting to a set and back)
306
329
  file_paths = list(set(file_paths))
307
-
330
+
308
331
  # Create scenarios
309
332
  scenarios = []
310
333
  for file_path in file_paths:
311
334
  if os.path.isfile(file_path):
312
335
  # Check if file should be ignored
313
336
  file_name = os.path.basename(file_path)
314
- if any(fnmatch.fnmatch(file_name, ignore_pattern) for ignore_pattern in self.ignore_files or []):
337
+ if any(
338
+ fnmatch.fnmatch(file_name, ignore_pattern)
339
+ for ignore_pattern in self.ignore_files or []
340
+ ):
315
341
  continue
316
-
342
+
317
343
  # Create FileStore object
318
344
  file_store = FileStore(file_path)
319
-
345
+
320
346
  # Create scenario
321
347
  scenario_data = {"file": file_store}
322
-
348
+
323
349
  # Add metadata if requested
324
350
  if self.metadata:
325
351
  file_stat = os.stat(file_path)
326
- scenario_data.update({
327
- "file_path": file_path,
328
- "file_name": file_name,
329
- "file_size": file_stat.st_size,
330
- "file_created": file_stat.st_ctime,
331
- "file_modified": file_stat.st_mtime,
332
- })
333
-
352
+ scenario_data.update(
353
+ {
354
+ "file_path": file_path,
355
+ "file_name": file_name,
356
+ "file_size": file_stat.st_size,
357
+ "file_created": file_stat.st_ctime,
358
+ "file_modified": file_stat.st_mtime,
359
+ }
360
+ )
361
+
334
362
  scenarios.append(Scenario(scenario_data))
335
-
363
+
336
364
  return ScenarioList(scenarios)
337
365
  else:
338
366
  # Use the standard scanning method for non-** patterns
@@ -348,148 +376,146 @@ class DirectorySource(Source):
348
376
 
349
377
  class TuplesSource(Source):
350
378
  source_type = "list_of_tuples"
351
-
352
- def __init__(self, field_names: list[str], values: list[tuple], use_indexes: bool = False):
379
+
380
+ def __init__(
381
+ self, field_names: list[str], values: list[tuple], use_indexes: bool = False
382
+ ):
353
383
  self.field_names = field_names
354
384
  self.values = values
355
385
  self.use_indexes = use_indexes
356
-
386
+
357
387
  # Validate inputs
358
388
  if not all(isinstance(v, (tuple, list)) for v in values):
359
389
  raise ScenarioError("All values must be tuples or lists")
360
-
390
+
361
391
  @classmethod
362
- def example(cls) -> 'TuplesSource':
392
+ def example(cls) -> "TuplesSource":
363
393
  """Return an example TuplesSource instance."""
364
394
  return cls(
365
395
  field_names=["name", "age", "city"],
366
396
  values=[
367
397
  ("Alice", 30, "New York"),
368
398
  ("Bob", 25, "San Francisco"),
369
- ("Charlie", 35, "Boston")
399
+ ("Charlie", 35, "Boston"),
370
400
  ],
371
- use_indexes=True
401
+ use_indexes=True,
372
402
  )
373
-
403
+
374
404
  def to_scenario_list(self):
375
405
  """Create a ScenarioList from a list of tuples with specified field names."""
376
406
  from .scenario_list import ScenarioList
377
-
407
+
378
408
  scenarios = []
379
-
409
+
380
410
  for i, value_tuple in enumerate(self.values):
381
411
  if len(value_tuple) != len(self.field_names):
382
412
  raise ScenarioError(
383
413
  f"Tuple {i} has {len(value_tuple)} elements, but {len(self.field_names)} field names were provided."
384
414
  )
385
-
415
+
386
416
  scenario_dict = dict(zip(self.field_names, value_tuple))
387
417
  if self.use_indexes:
388
418
  scenario_dict["idx"] = i
389
419
  scenarios.append(Scenario(scenario_dict))
390
-
420
+
391
421
  return ScenarioList(scenarios)
392
422
 
393
423
 
394
424
  class SQLiteSource(Source):
395
425
  source_type = "sqlite"
396
-
426
+
397
427
  def __init__(self, db_path: str, table: str, fields: Optional[list] = None):
398
428
  self.db_path = db_path
399
429
  self.table = table
400
430
  self.fields = fields
401
-
431
+
402
432
  @classmethod
403
- def example(cls) -> 'SQLiteSource':
433
+ def example(cls) -> "SQLiteSource":
404
434
  """Return an example SQLiteSource instance."""
405
435
  import sqlite3
406
436
  import tempfile
407
437
  import os
408
-
438
+
409
439
  # Create a temporary SQLite database for the example
410
- fd, temp_path = tempfile.mkstemp(suffix='.db', prefix='edsl_test_')
440
+ fd, temp_path = tempfile.mkstemp(suffix=".db", prefix="edsl_test_")
411
441
  os.close(fd) # Close the file descriptor
412
-
442
+
413
443
  # Connect to the database and create a sample table
414
444
  conn = sqlite3.connect(temp_path)
415
445
  cursor = conn.cursor()
416
-
446
+
417
447
  # Create a simple table
418
- cursor.execute('CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)')
419
-
448
+ cursor.execute(
449
+ "CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)"
450
+ )
451
+
420
452
  # Insert sample data
421
- sample_data = [
422
- (1, 'Alpha', 100),
423
- (2, 'Beta', 200),
424
- (3, 'Gamma', 300)
425
- ]
426
- cursor.executemany('INSERT INTO test_table VALUES (?, ?, ?)', sample_data)
427
-
453
+ sample_data = [(1, "Alpha", 100), (2, "Beta", 200), (3, "Gamma", 300)]
454
+ cursor.executemany("INSERT INTO test_table VALUES (?, ?, ?)", sample_data)
455
+
428
456
  conn.commit()
429
457
  conn.close()
430
-
458
+
431
459
  return cls(
432
- db_path=temp_path,
433
- table='test_table',
434
- fields=['id', 'name', 'value']
460
+ db_path=temp_path, table="test_table", fields=["id", "name", "value"]
435
461
  )
436
-
462
+
437
463
  def to_scenario_list(self):
438
464
  """Create a ScenarioList from a SQLite database."""
439
465
  from .scenario_list import ScenarioList
440
466
  import sqlite3
441
-
467
+
442
468
  conn = sqlite3.connect(self.db_path)
443
469
  cursor = conn.cursor()
444
-
470
+
445
471
  # If fields weren't provided, get all fields from the table
446
472
  fields = self.fields
447
473
  if fields is None:
448
474
  cursor.execute(f"PRAGMA table_info({self.table})")
449
475
  fields = [row[1] for row in cursor.fetchall()]
450
-
476
+
451
477
  # Query the data
452
478
  field_placeholders = ", ".join(fields)
453
479
  cursor.execute(f"SELECT {field_placeholders} FROM {self.table}")
454
480
  rows = cursor.fetchall()
455
-
481
+
456
482
  # Create scenarios
457
483
  scenarios = []
458
484
  for row in rows:
459
485
  scenario_dict = dict(zip(fields, row))
460
486
  scenarios.append(Scenario(scenario_dict))
461
-
487
+
462
488
  conn.close()
463
489
  return ScenarioList(scenarios)
464
490
 
465
491
 
466
492
  class LaTeXSource(Source):
467
493
  source_type = "latex"
468
-
494
+
469
495
  def __init__(self, file_path: str, table_index: int = 0, has_header: bool = True):
470
496
  """
471
497
  Initialize a LaTeXSource with a LaTeX file path.
472
-
498
+
473
499
  Args:
474
500
  file_path: The path to the LaTeX file.
475
- table_index: The index of the table to extract (if multiple tables exist).
501
+ table_index: The index of the table to extract (if multiple tables exist).
476
502
  Default is 0 (first table).
477
503
  has_header: Whether the table has a header row. Default is True.
478
504
  """
479
505
  self.file_path = file_path
480
506
  self.table_index = table_index
481
507
  self.has_header = has_header
482
-
508
+
483
509
  @classmethod
484
- def example(cls) -> 'LaTeXSource':
510
+ def example(cls) -> "LaTeXSource":
485
511
  """Return an example LaTeXSource instance."""
486
512
  import tempfile
487
513
  import os
488
-
514
+
489
515
  # Create a temporary LaTeX file with a sample table
490
- fd, temp_path = tempfile.mkstemp(suffix='.tex', prefix='edsl_test_')
516
+ fd, temp_path = tempfile.mkstemp(suffix=".tex", prefix="edsl_test_")
491
517
  os.close(fd) # Close the file descriptor
492
-
518
+
493
519
  # Write a sample LaTeX table to the file
494
520
  sample_latex = r"""
495
521
  \documentclass{article}
@@ -505,39 +531,35 @@ Charlie & 35 & 92 \\
505
531
 
506
532
  \end{document}
507
533
  """
508
- with open(temp_path, 'w') as f:
534
+ with open(temp_path, "w") as f:
509
535
  f.write(sample_latex)
510
-
511
- return cls(
512
- file_path=temp_path,
513
- table_index=0,
514
- has_header=True
515
- )
516
-
536
+
537
+ return cls(file_path=temp_path, table_index=0, has_header=True)
538
+
517
539
  def to_scenario_list(self):
518
540
  """Create a ScenarioList from a LaTeX file."""
519
541
  from .scenario_list import ScenarioList
520
542
  import re
521
-
543
+
522
544
  with open(self.file_path, "r") as f:
523
545
  content = f.read()
524
-
546
+
525
547
  # Find all tabular environments
526
548
  tabular_pattern = r"\\begin{tabular}(.*?)\\end{tabular}"
527
549
  tables = re.findall(tabular_pattern, content, re.DOTALL)
528
-
550
+
529
551
  if not tables or self.table_index >= len(tables):
530
552
  raise ScenarioError(f"No table found at index {self.table_index}")
531
-
553
+
532
554
  table_content = tables[self.table_index]
533
-
555
+
534
556
  # Extract rows
535
557
  rows = table_content.split("\\\\")
536
558
  rows = [row.strip() for row in rows if row.strip()]
537
-
559
+
538
560
  if not rows:
539
561
  return ScenarioList()
540
-
562
+
541
563
  # Process header if available
542
564
  if self.has_header:
543
565
  header_row = rows[0]
@@ -545,98 +567,104 @@ Charlie & 35 & 92 \\
545
567
  if not header_cells:
546
568
  header_cells = header_row.split("&")
547
569
  header_cells = [h.strip() for h in header_cells]
548
-
570
+
549
571
  data_rows = rows[1:]
550
572
  else:
551
573
  # Auto-generate column names
552
574
  header_cells = [f"col{i}" for i in range(rows[0].count("&") + 1)]
553
575
  data_rows = rows
554
-
576
+
555
577
  # Process data rows
556
578
  scenarios = []
557
579
  for row in data_rows:
558
580
  cells = row.split("&")
559
581
  cells = [cell.strip() for cell in cells]
560
-
582
+
561
583
  if len(cells) != len(header_cells):
562
584
  continue # Skip malformed rows
563
-
585
+
564
586
  scenario_dict = dict(zip(header_cells, cells))
565
587
  scenarios.append(Scenario(scenario_dict))
566
-
588
+
567
589
  return ScenarioList(scenarios)
568
590
 
569
591
 
570
592
  class GoogleDocSource(Source):
571
593
  source_type = "google_doc"
572
-
594
+
573
595
  def __init__(self, url: str):
574
596
  """
575
597
  Initialize a GoogleDocSource with a Google Doc URL.
576
-
598
+
577
599
  Args:
578
600
  url: The URL to the Google Doc.
579
601
  """
580
602
  self.url = url
581
-
603
+
582
604
  @classmethod
583
- def example(cls) -> 'GoogleDocSource':
605
+ def example(cls) -> "GoogleDocSource":
584
606
  """Return an example GoogleDocSource instance."""
585
607
  # Create a mock instance that doesn't actually fetch a Google Doc
586
- instance = cls(url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit")
587
-
608
+ instance = cls(
609
+ url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit"
610
+ )
611
+
588
612
  # Override the to_scenario_list method just for the example
589
613
  def mock_to_scenario_list(self):
590
614
  from .scenario_list import ScenarioList
615
+
591
616
  # Create a simple mock ScenarioList with a few paragraphs
592
617
  scenarios = [
593
618
  Scenario({"text": "This is paragraph 1 from a sample Google Doc."}),
594
619
  Scenario({"text": "This is paragraph 2 with some more content."}),
595
- Scenario({"text": "This is the final paragraph with a conclusion."})
620
+ Scenario({"text": "This is the final paragraph with a conclusion."}),
596
621
  ]
597
622
  return ScenarioList(scenarios)
598
-
623
+
599
624
  # Replace the method on this instance only
600
625
  import types
626
+
601
627
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
602
-
628
+
603
629
  return instance
604
-
630
+
605
631
  def to_scenario_list(self):
606
632
  """Create a ScenarioList from a Google Doc."""
607
633
  from .scenario_list import ScenarioList
608
634
  import tempfile
609
635
  import requests
610
-
636
+
611
637
  # Extract the document ID from the URL
612
638
  if "/edit" in self.url:
613
639
  doc_id = self.url.split("/d/")[1].split("/edit")[0]
614
640
  else:
615
641
  raise ScenarioError("Invalid Google Doc URL format.")
616
-
642
+
617
643
  # Create the export URL to download as DOCX
618
644
  export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
619
-
645
+
620
646
  try:
621
647
  # Download the Google Doc as a Word file (.docx)
622
648
  response = requests.get(export_url)
623
649
  response.raise_for_status() # Ensure the request was successful
624
-
650
+
625
651
  # Save the Word file to a temporary file
626
652
  with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
627
653
  temp_file.write(response.content)
628
654
  temp_filename = temp_file.name
629
-
655
+
630
656
  # Use the DocxScenario class to process the temporary file
631
657
  from .scenario_list import ScenarioList
632
658
  from .DocxScenario import DocxScenario
633
-
659
+
634
660
  # Create a scenario from the DOCX file
635
661
  docx_scenario = DocxScenario(temp_filename)
636
- scenarios = [Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs]
637
-
662
+ scenarios = [
663
+ Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs
664
+ ]
665
+
638
666
  return ScenarioList(scenarios)
639
-
667
+
640
668
  except requests.RequestException as e:
641
669
  raise ScenarioError(f"Failed to fetch Google Doc: {str(e)}")
642
670
  except Exception as e:
@@ -645,79 +673,84 @@ class GoogleDocSource(Source):
645
673
 
646
674
  class PandasSource(Source):
647
675
  source_type = "pandas"
648
-
676
+
649
677
  def __init__(self, df):
650
678
  """
651
679
  Initialize a PandasSource with a pandas DataFrame.
652
-
680
+
653
681
  Args:
654
682
  df: A pandas DataFrame.
655
683
  """
656
684
  try:
657
685
  import pandas as pd
686
+
658
687
  if not isinstance(df, pd.DataFrame):
659
688
  raise ScenarioError("Input must be a pandas DataFrame")
660
689
  self.df = df
661
690
  except ImportError:
662
691
  raise ImportError("pandas is required for PandasSource")
663
-
692
+
664
693
  @classmethod
665
- def example(cls) -> 'PandasSource':
694
+ def example(cls) -> "PandasSource":
666
695
  """Return an example PandasSource instance."""
667
696
  try:
668
697
  import pandas as pd
669
-
698
+
670
699
  # Create a sample DataFrame for the example
671
700
  sample_data = {
672
- 'name': ['Alice', 'Bob', 'Charlie', 'David'],
673
- 'age': [30, 25, 35, 28],
674
- 'city': ['New York', 'San Francisco', 'Boston', 'Seattle']
701
+ "name": ["Alice", "Bob", "Charlie", "David"],
702
+ "age": [30, 25, 35, 28],
703
+ "city": ["New York", "San Francisco", "Boston", "Seattle"],
675
704
  }
676
705
  df = pd.DataFrame(sample_data)
677
-
706
+
678
707
  return cls(df)
679
708
  except ImportError:
680
709
  # Create a mock instance that doesn't actually need pandas
681
710
  instance = cls.__new__(cls)
682
-
711
+
683
712
  # Override the to_scenario_list method just for the example
684
713
  def mock_to_scenario_list(self):
685
714
  from .scenario_list import ScenarioList
715
+
686
716
  # Create a simple mock ScenarioList
687
717
  scenarios = [
688
718
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
689
719
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
690
720
  Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
691
- Scenario({"name": "David", "age": 28, "city": "Seattle"})
721
+ Scenario({"name": "David", "age": 28, "city": "Seattle"}),
692
722
  ]
693
723
  return ScenarioList(scenarios)
694
-
724
+
695
725
  # Replace the method on this instance only
696
726
  import types
697
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
698
-
727
+
728
+ instance.to_scenario_list = types.MethodType(
729
+ mock_to_scenario_list, instance
730
+ )
731
+
699
732
  return instance
700
-
733
+
701
734
  def to_scenario_list(self):
702
735
  """Create a ScenarioList from a pandas DataFrame."""
703
736
  from .scenario_list import ScenarioList
704
-
737
+
705
738
  # Convert DataFrame records to scenarios
706
739
  scenarios = []
707
740
  for _, row in self.df.iterrows():
708
741
  scenario_dict = row.to_dict()
709
742
  scenarios.append(Scenario(scenario_dict))
710
-
743
+
711
744
  return ScenarioList(scenarios)
712
745
 
713
746
 
714
747
  class StataSource(Source):
715
748
  source_type = "dta"
716
-
749
+
717
750
  def __init__(self, file_path: str, include_metadata: bool = True):
718
751
  """
719
752
  Initialize a StataSource with a path to a Stata data file.
720
-
753
+
721
754
  Args:
722
755
  file_path: Path to the Stata (.dta) file.
723
756
  include_metadata: If True, extract and preserve variable labels and value labels
@@ -725,102 +758,108 @@ class StataSource(Source):
725
758
  """
726
759
  self.file_path = file_path
727
760
  self.include_metadata = include_metadata
728
-
761
+
729
762
  @classmethod
730
- def example(cls) -> 'StataSource':
763
+ def example(cls) -> "StataSource":
731
764
  """Return an example StataSource instance."""
732
765
  import tempfile
733
766
  import os
734
-
767
+
735
768
  # Since we can't easily create a real Stata file for testing,
736
769
  # we'll create a mock instance with an override
737
770
  instance = cls(file_path="/path/to/nonexistent/file.dta")
738
-
771
+
739
772
  # Override the to_scenario_list method just for the example
740
773
  def mock_to_scenario_list(self):
741
774
  from .scenario_list import ScenarioList
742
-
775
+
743
776
  # Create a simple mock ScenarioList with Stata-like data
744
777
  scenarios = [
745
778
  Scenario({"id": 1, "gender": 1, "income": 50000, "education": 2}),
746
779
  Scenario({"id": 2, "gender": 2, "income": 45000, "education": 3}),
747
- Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4})
780
+ Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4}),
748
781
  ]
749
-
782
+
750
783
  result = ScenarioList(scenarios)
751
-
784
+
752
785
  # Add metadata similar to what would be in a Stata file
753
786
  if self.include_metadata:
754
787
  result.codebook = {
755
788
  "variable_labels": {
756
789
  "gender": "Gender (1=Male, 2=Female)",
757
790
  "income": "Annual income in USD",
758
- "education": "Education level (1-4)"
791
+ "education": "Education level (1-4)",
759
792
  },
760
793
  "value_labels": {
761
794
  "gender": {1: "Male", 2: "Female"},
762
- "education": {1: "High School", 2: "Associate", 3: "Bachelor", 4: "Graduate"}
763
- }
795
+ "education": {
796
+ 1: "High School",
797
+ 2: "Associate",
798
+ 3: "Bachelor",
799
+ 4: "Graduate",
800
+ },
801
+ },
764
802
  }
765
-
803
+
766
804
  return result
767
-
805
+
768
806
  # Replace the method on this instance only
769
807
  import types
808
+
770
809
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
771
-
810
+
772
811
  return instance
773
-
812
+
774
813
  def to_scenario_list(self):
775
814
  """Create a ScenarioList from a Stata data file."""
776
815
  from .scenario_list import ScenarioList
777
-
816
+
778
817
  try:
779
818
  import pandas as pd
780
819
  except ImportError:
781
820
  raise ImportError("pandas is required to read Stata files")
782
-
821
+
783
822
  # Read the Stata file with pandas
784
823
  df = pd.read_stata(self.file_path)
785
-
824
+
786
825
  # Create scenarios
787
826
  scenarios = []
788
827
  for _, row in df.iterrows():
789
828
  scenario_dict = row.to_dict()
790
829
  scenarios.append(Scenario(scenario_dict))
791
-
830
+
792
831
  # Create the basic ScenarioList
793
832
  result = ScenarioList(scenarios)
794
-
833
+
795
834
  # Extract and preserve metadata if requested
796
835
  if self.include_metadata:
797
836
  # Get variable labels (if any)
798
837
  variable_labels = {}
799
838
  if hasattr(df, "variable_labels") and df.variable_labels:
800
839
  variable_labels = df.variable_labels
801
-
840
+
802
841
  # Get value labels (if any)
803
842
  value_labels = {}
804
843
  if hasattr(df, "value_labels") and df.value_labels:
805
844
  value_labels = df.value_labels
806
-
845
+
807
846
  # Store the metadata in the ScenarioList's codebook
808
847
  if variable_labels or value_labels:
809
848
  result.codebook = {
810
849
  "variable_labels": variable_labels,
811
850
  "value_labels": value_labels,
812
851
  }
813
-
852
+
814
853
  return result
815
854
 
816
855
 
817
856
  class WikipediaSource(Source):
818
857
  source_type = "wikipedia"
819
-
858
+
820
859
  def __init__(self, url: str, table_index: int = 0, header: bool = True):
821
860
  """
822
861
  Initialize a WikipediaSource with a URL to a Wikipedia page.
823
-
862
+
824
863
  Args:
825
864
  url: The URL of the Wikipedia page.
826
865
  table_index: The index of the table to extract (default is 0).
@@ -829,74 +868,89 @@ class WikipediaSource(Source):
829
868
  self.url = url
830
869
  self.table_index = table_index
831
870
  self.header = header
832
-
871
+
833
872
  @classmethod
834
- def example(cls) -> 'WikipediaSource':
873
+ def example(cls) -> "WikipediaSource":
835
874
  """Return an example WikipediaSource instance."""
836
875
  # Use a real Wikipedia URL for the example, but we'll override the to_scenario_list method
837
876
  instance = cls(
838
877
  url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
839
878
  table_index=0,
840
- header=True
879
+ header=True,
841
880
  )
842
-
881
+
843
882
  # Override the to_scenario_list method just for the example
844
883
  def mock_to_scenario_list(self):
845
884
  from .scenario_list import ScenarioList
846
-
885
+
847
886
  # Create a simple mock ScenarioList with GDP data
848
887
  scenarios = [
849
- Scenario({"Rank": 1, "Country": "United States", "GDP (millions of USD)": 25460000}),
850
- Scenario({"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}),
851
- Scenario({"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}),
852
- Scenario({"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}),
853
- Scenario({"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000})
888
+ Scenario(
889
+ {
890
+ "Rank": 1,
891
+ "Country": "United States",
892
+ "GDP (millions of USD)": 25460000,
893
+ }
894
+ ),
895
+ Scenario(
896
+ {"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}
897
+ ),
898
+ Scenario(
899
+ {"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}
900
+ ),
901
+ Scenario(
902
+ {"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}
903
+ ),
904
+ Scenario(
905
+ {"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000}
906
+ ),
854
907
  ]
855
-
908
+
856
909
  return ScenarioList(scenarios)
857
-
910
+
858
911
  # Replace the method on this instance only
859
912
  import types
913
+
860
914
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
861
-
915
+
862
916
  return instance
863
-
917
+
864
918
  def to_scenario_list(self):
865
919
  """Create a ScenarioList from a table on a Wikipedia page."""
866
920
  from .scenario_list import ScenarioList
867
921
  import requests
868
-
922
+
869
923
  try:
870
924
  # Try to import pandas
871
925
  import pandas as pd
872
926
  except ImportError:
873
927
  raise ImportError("pandas is required to read Wikipedia tables")
874
-
928
+
875
929
  try:
876
930
  # Check if the URL is reachable
877
931
  response = requests.get(self.url)
878
932
  response.raise_for_status() # Raises HTTPError for bad responses
879
-
933
+
880
934
  # Extract tables from the Wikipedia page
881
935
  tables = pd.read_html(self.url, header=0 if self.header else None)
882
-
936
+
883
937
  # Ensure the requested table index is within the range of available tables
884
938
  if self.table_index >= len(tables) or self.table_index < 0:
885
939
  raise ScenarioError(
886
940
  f"Table index {self.table_index} is out of range. This page has {len(tables)} table(s)."
887
941
  )
888
-
942
+
889
943
  # Get the requested table
890
944
  df = tables[self.table_index]
891
-
945
+
892
946
  # Convert DataFrame to ScenarioList
893
947
  scenarios = []
894
948
  for _, row in df.iterrows():
895
949
  scenario_dict = row.to_dict()
896
950
  scenarios.append(Scenario(scenario_dict))
897
-
951
+
898
952
  return ScenarioList(scenarios)
899
-
953
+
900
954
  except requests.exceptions.RequestException as e:
901
955
  raise ScenarioError(f"Error fetching the URL: {str(e)}")
902
956
  except ValueError as e:
@@ -907,18 +961,18 @@ class WikipediaSource(Source):
907
961
 
908
962
  class ExcelSource(Source):
909
963
  source_type = "excel"
910
-
964
+
911
965
  def __init__(
912
- self,
913
- file_path: str,
914
- sheet_name: Optional[str] = None,
966
+ self,
967
+ file_path: str,
968
+ sheet_name: Optional[str] = None,
915
969
  skip_rows: Optional[List[int]] = None,
916
970
  use_codebook: bool = False,
917
- **kwargs
971
+ **kwargs,
918
972
  ):
919
973
  """
920
974
  Initialize an ExcelSource with a path to an Excel file.
921
-
975
+
922
976
  Args:
923
977
  file_path: Path to the Excel file.
924
978
  sheet_name: Name of the sheet to load. If None and multiple sheets exist,
@@ -932,76 +986,81 @@ class ExcelSource(Source):
932
986
  self.skip_rows = skip_rows
933
987
  self.use_codebook = use_codebook
934
988
  self.kwargs = kwargs
935
-
989
+
936
990
  @classmethod
937
- def example(cls) -> 'ExcelSource':
991
+ def example(cls) -> "ExcelSource":
938
992
  """Return an example ExcelSource instance."""
939
993
  import tempfile
940
994
  import os
941
-
995
+
942
996
  try:
943
997
  import pandas as pd
944
-
998
+
945
999
  # Create a temporary Excel file with sample data
946
- fd, temp_path = tempfile.mkstemp(suffix='.xlsx', prefix='edsl_test_')
1000
+ fd, temp_path = tempfile.mkstemp(suffix=".xlsx", prefix="edsl_test_")
947
1001
  os.close(fd) # Close the file descriptor
948
-
1002
+
949
1003
  # Create sample data
950
- df1 = pd.DataFrame({
951
- 'name': ['Alice', 'Bob', 'Charlie'],
952
- 'age': [30, 25, 35],
953
- 'city': ['New York', 'San Francisco', 'Boston']
954
- })
955
-
956
- df2 = pd.DataFrame({
957
- 'name': ['David', 'Eve'],
958
- 'age': [40, 45],
959
- 'city': ['Seattle', 'Chicago']
960
- })
961
-
1004
+ df1 = pd.DataFrame(
1005
+ {
1006
+ "name": ["Alice", "Bob", "Charlie"],
1007
+ "age": [30, 25, 35],
1008
+ "city": ["New York", "San Francisco", "Boston"],
1009
+ }
1010
+ )
1011
+
1012
+ df2 = pd.DataFrame(
1013
+ {
1014
+ "name": ["David", "Eve"],
1015
+ "age": [40, 45],
1016
+ "city": ["Seattle", "Chicago"],
1017
+ }
1018
+ )
1019
+
962
1020
  # Write to Excel file with multiple sheets
963
1021
  with pd.ExcelWriter(temp_path) as writer:
964
- df1.to_excel(writer, sheet_name='Sheet1', index=False)
965
- df2.to_excel(writer, sheet_name='Sheet2', index=False)
966
-
967
- return cls(
968
- file_path=temp_path,
969
- sheet_name='Sheet1'
970
- )
971
-
1022
+ df1.to_excel(writer, sheet_name="Sheet1", index=False)
1023
+ df2.to_excel(writer, sheet_name="Sheet2", index=False)
1024
+
1025
+ return cls(file_path=temp_path, sheet_name="Sheet1")
1026
+
972
1027
  except ImportError:
973
1028
  # Create a mock instance with an override if pandas is not available
974
1029
  instance = cls(file_path="/path/to/nonexistent/file.xlsx")
975
-
1030
+
976
1031
  # Override the to_scenario_list method just for the example
977
1032
  def mock_to_scenario_list(self):
978
1033
  from .scenario_list import ScenarioList
1034
+
979
1035
  # Create a simple mock ScenarioList with sample data
980
1036
  scenarios = [
981
1037
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
982
1038
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
983
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1039
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
984
1040
  ]
985
1041
  return ScenarioList(scenarios)
986
-
1042
+
987
1043
  # Replace the method on this instance only
988
1044
  import types
989
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
990
-
1045
+
1046
+ instance.to_scenario_list = types.MethodType(
1047
+ mock_to_scenario_list, instance
1048
+ )
1049
+
991
1050
  return instance
992
-
1051
+
993
1052
  def to_scenario_list(self):
994
1053
  """Create a ScenarioList from an Excel file."""
995
1054
  from .scenario_list import ScenarioList
996
-
1055
+
997
1056
  try:
998
1057
  import pandas as pd
999
1058
  except ImportError:
1000
1059
  raise ImportError("pandas is required to read Excel files")
1001
-
1060
+
1002
1061
  # Get all sheets
1003
1062
  all_sheets = pd.read_excel(self.file_path, sheet_name=None)
1004
-
1063
+
1005
1064
  # If no sheet_name is provided and there is more than one sheet, print available sheets
1006
1065
  sheet_name = self.sheet_name
1007
1066
  if sheet_name is None:
@@ -1015,27 +1074,27 @@ class ExcelSource(Source):
1015
1074
  else:
1016
1075
  # If there is only one sheet, use it
1017
1076
  sheet_name = list(all_sheets.keys())[0]
1018
-
1077
+
1019
1078
  # Load the specified or determined sheet
1020
1079
  df = pd.read_excel(self.file_path, sheet_name=sheet_name, **self.kwargs)
1021
-
1080
+
1022
1081
  # Skip specified rows if any
1023
1082
  if self.skip_rows:
1024
1083
  df = df.drop(self.skip_rows)
1025
1084
  # Reset index to ensure continuous indexing
1026
1085
  df = df.reset_index(drop=True)
1027
-
1086
+
1028
1087
  # Handle codebook if requested
1029
1088
  if self.use_codebook:
1030
1089
  codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
1031
1090
  koobedoc = {col: f"col_{i}" for i, col in enumerate(df.columns)}
1032
-
1091
+
1033
1092
  # Create scenarios with renamed columns
1034
1093
  scenarios = []
1035
1094
  for _, row in df.iterrows():
1036
1095
  scenario_dict = {koobedoc.get(k): v for k, v in row.to_dict().items()}
1037
1096
  scenarios.append(Scenario(scenario_dict))
1038
-
1097
+
1039
1098
  result = ScenarioList(scenarios)
1040
1099
  result.codebook = codebook
1041
1100
  return result
@@ -1045,23 +1104,23 @@ class ExcelSource(Source):
1045
1104
  for _, row in df.iterrows():
1046
1105
  scenario_dict = row.to_dict()
1047
1106
  scenarios.append(Scenario(scenario_dict))
1048
-
1107
+
1049
1108
  return ScenarioList(scenarios)
1050
1109
 
1051
1110
 
1052
1111
  class GoogleSheetSource(Source):
1053
1112
  source_type = "google_sheet"
1054
-
1113
+
1055
1114
  def __init__(
1056
- self,
1057
- url: str,
1058
- sheet_name: Optional[str] = None,
1115
+ self,
1116
+ url: str,
1117
+ sheet_name: Optional[str] = None,
1059
1118
  column_names: Optional[List[str]] = None,
1060
- **kwargs
1119
+ **kwargs,
1061
1120
  ):
1062
1121
  """
1063
1122
  Initialize a GoogleSheetSource with a URL to a Google Sheet.
1064
-
1123
+
1065
1124
  Args:
1066
1125
  url: The URL of the Google Sheet.
1067
1126
  sheet_name: The name of the sheet to load. If None, the first sheet will be used.
@@ -1073,67 +1132,68 @@ class GoogleSheetSource(Source):
1073
1132
  self.sheet_name = sheet_name
1074
1133
  self.column_names = column_names
1075
1134
  self.kwargs = kwargs
1076
-
1135
+
1077
1136
  @classmethod
1078
- def example(cls) -> 'GoogleSheetSource':
1137
+ def example(cls) -> "GoogleSheetSource":
1079
1138
  """Return an example GoogleSheetSource instance."""
1080
1139
  # Use a mock instance since we can't create a real Google Sheet for testing
1081
1140
  instance = cls(
1082
1141
  url="https://docs.google.com/spreadsheets/d/1234567890abcdefg/edit",
1083
- sheet_name="Sheet1"
1142
+ sheet_name="Sheet1",
1084
1143
  )
1085
-
1144
+
1086
1145
  # Override the to_scenario_list method just for the example
1087
1146
  def mock_to_scenario_list(self):
1088
1147
  from .scenario_list import ScenarioList
1089
-
1148
+
1090
1149
  # Create a simple mock ScenarioList with sample data
1091
1150
  scenarios = [
1092
1151
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
1093
1152
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
1094
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1153
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
1095
1154
  ]
1096
1155
  return ScenarioList(scenarios)
1097
-
1156
+
1098
1157
  # Replace the method on this instance only
1099
1158
  import types
1159
+
1100
1160
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1101
-
1161
+
1102
1162
  return instance
1103
-
1163
+
1104
1164
  def to_scenario_list(self):
1105
1165
  """Create a ScenarioList from a Google Sheet."""
1106
1166
  from .scenario_list import ScenarioList
1107
1167
  import tempfile
1108
1168
  import requests
1109
-
1169
+
1110
1170
  # Extract the sheet ID from the URL
1111
1171
  if "/edit" in self.url:
1112
1172
  sheet_id = self.url.split("/d/")[1].split("/edit")[0]
1113
1173
  else:
1114
1174
  raise ScenarioError("Invalid Google Sheet URL format.")
1115
-
1175
+
1116
1176
  # Create the export URL for XLSX format
1117
- export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
1118
-
1177
+ export_url = (
1178
+ f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
1179
+ )
1180
+
1119
1181
  try:
1120
1182
  # Download the Google Sheet as an Excel file
1121
1183
  response = requests.get(export_url)
1122
1184
  response.raise_for_status() # Ensure the request was successful
1123
-
1185
+
1124
1186
  # Save the Excel file to a temporary file
1125
1187
  with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
1126
1188
  temp_file.write(response.content)
1127
1189
  temp_filename = temp_file.name
1128
-
1190
+
1129
1191
  # Use ExcelSource to create the initial ScenarioList
1130
1192
  excel_source = ExcelSource(
1131
- file_path=temp_filename,
1132
- sheet_name=self.sheet_name,
1133
- **self.kwargs
1193
+ file_path=temp_filename, sheet_name=self.sheet_name, **self.kwargs
1134
1194
  )
1135
1195
  scenario_list = excel_source.to_scenario_list()
1136
-
1196
+
1137
1197
  # Apply column renaming if specified
1138
1198
  if self.column_names is not None and scenario_list:
1139
1199
  if len(self.column_names) != len(scenario_list[0].keys()):
@@ -1141,21 +1201,23 @@ class GoogleSheetSource(Source):
1141
1201
  f"Number of provided column names ({len(self.column_names)}) "
1142
1202
  f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
1143
1203
  )
1144
-
1204
+
1145
1205
  # Create a mapping from original keys to new names
1146
1206
  original_keys = list(scenario_list[0].keys())
1147
1207
  column_mapping = dict(zip(original_keys, self.column_names))
1148
-
1208
+
1149
1209
  # Create a new ScenarioList with renamed columns
1150
1210
  renamed_scenarios = []
1151
1211
  for scenario in scenario_list:
1152
- renamed_scenario = {column_mapping.get(k, k): v for k, v in scenario.items()}
1212
+ renamed_scenario = {
1213
+ column_mapping.get(k, k): v for k, v in scenario.items()
1214
+ }
1153
1215
  renamed_scenarios.append(Scenario(renamed_scenario))
1154
-
1216
+
1155
1217
  return ScenarioList(renamed_scenarios)
1156
-
1218
+
1157
1219
  return scenario_list
1158
-
1220
+
1159
1221
  except requests.exceptions.RequestException as e:
1160
1222
  raise ScenarioError(f"Error fetching the Google Sheet: {str(e)}")
1161
1223
  except Exception as e:
@@ -1164,18 +1226,18 @@ class GoogleSheetSource(Source):
1164
1226
 
1165
1227
  class DelimitedFileSource(Source):
1166
1228
  source_type = "delimited_file"
1167
-
1229
+
1168
1230
  def __init__(
1169
- self,
1231
+ self,
1170
1232
  file_or_url: str,
1171
1233
  delimiter: str = ",",
1172
1234
  has_header: bool = True,
1173
1235
  encoding: str = "utf-8",
1174
- **kwargs
1236
+ **kwargs,
1175
1237
  ):
1176
1238
  """
1177
1239
  Initialize a DelimitedFileSource with a path to a delimited file or URL.
1178
-
1240
+
1179
1241
  Args:
1180
1242
  file_or_url: Path to a local file or URL to a remote file.
1181
1243
  delimiter: The delimiter character used in the file (default is ',').
@@ -1188,42 +1250,38 @@ class DelimitedFileSource(Source):
1188
1250
  self.has_header = has_header
1189
1251
  self.encoding = encoding
1190
1252
  self.kwargs = kwargs
1191
-
1253
+
1192
1254
  @classmethod
1193
- def example(cls) -> 'DelimitedFileSource':
1255
+ def example(cls) -> "DelimitedFileSource":
1194
1256
  """Return an example DelimitedFileSource instance."""
1195
1257
  import tempfile
1196
1258
  import os
1197
-
1259
+
1198
1260
  # Create a temporary CSV file with sample data
1199
- fd, temp_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
1261
+ fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
1200
1262
  os.close(fd) # Close the file descriptor
1201
-
1263
+
1202
1264
  # Write sample data to the file
1203
- with open(temp_path, 'w', newline='') as f:
1265
+ with open(temp_path, "w", newline="") as f:
1204
1266
  f.write("name,age,city\n")
1205
1267
  f.write("Alice,30,New York\n")
1206
1268
  f.write("Bob,25,San Francisco\n")
1207
1269
  f.write("Charlie,35,Boston\n")
1208
-
1209
- return cls(
1210
- file_or_url=temp_path,
1211
- delimiter=",",
1212
- has_header=True
1213
- )
1214
-
1270
+
1271
+ return cls(file_or_url=temp_path, delimiter=",", has_header=True)
1272
+
1215
1273
  def to_scenario_list(self):
1216
1274
  """Create a ScenarioList from a delimited file or URL."""
1217
1275
  from .scenario_list import ScenarioList
1218
1276
  import requests
1219
-
1277
+
1220
1278
  # Check if the input is a URL
1221
1279
  parsed_url = urlparse(self.file_or_url)
1222
1280
  if parsed_url.scheme in ("http", "https"):
1223
1281
  try:
1224
1282
  headers = {
1225
1283
  "Accept": "text/csv,application/csv,text/plain",
1226
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
1284
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
1227
1285
  }
1228
1286
  response = requests.get(self.file_or_url, headers=headers)
1229
1287
  response.raise_for_status()
@@ -1240,7 +1298,7 @@ class DelimitedFileSource(Source):
1240
1298
  encodings_to_try = ["latin-1", "cp1252", "ISO-8859-1"]
1241
1299
  if self.encoding in encodings_to_try:
1242
1300
  encodings_to_try.remove(self.encoding)
1243
-
1301
+
1244
1302
  for encoding in encodings_to_try:
1245
1303
  try:
1246
1304
  with open(self.file_or_url, "r", encoding=encoding) as f:
@@ -1249,17 +1307,21 @@ class DelimitedFileSource(Source):
1249
1307
  except UnicodeDecodeError:
1250
1308
  continue
1251
1309
  else:
1252
- raise ScenarioError(f"Failed to decode file with any of the attempted encodings")
1310
+ raise ScenarioError(
1311
+ f"Failed to decode file with any of the attempted encodings"
1312
+ )
1253
1313
  except Exception as e:
1254
1314
  raise ScenarioError(f"Failed to read file: {str(e)}")
1255
-
1315
+
1256
1316
  # Parse the content
1257
- csv_reader = csv.reader(StringIO(content), delimiter=self.delimiter, **self.kwargs)
1317
+ csv_reader = csv.reader(
1318
+ StringIO(content), delimiter=self.delimiter, **self.kwargs
1319
+ )
1258
1320
  rows = list(csv_reader)
1259
-
1321
+
1260
1322
  if not rows:
1261
1323
  return ScenarioList()
1262
-
1324
+
1263
1325
  # Handle header row
1264
1326
  if self.has_header:
1265
1327
  header = rows[0]
@@ -1268,33 +1330,49 @@ class DelimitedFileSource(Source):
1268
1330
  # Auto-generate column names
1269
1331
  header = [f"col{i}" for i in range(len(rows[0]))]
1270
1332
  data_rows = rows
1271
-
1333
+
1334
+ header_counts = defaultdict(lambda: 0)
1335
+ new_header = []
1336
+ for h in header:
1337
+ if header_counts[h] >= 1:
1338
+ new_header.append(f"{h}_{header_counts[h]}")
1339
+ warnings.warn(
1340
+ f"Duplicate header found: {h}. Renamed to {h}_{header_counts[h]}"
1341
+ )
1342
+ else:
1343
+ new_header.append(h)
1344
+ header_counts[h] += 1
1345
+
1346
+ assert len(new_header) == len(set(new_header))
1347
+
1272
1348
  # Create scenarios
1273
1349
  scenarios = []
1274
1350
  for row in data_rows:
1275
- if len(row) != len(header):
1276
- warnings.warn(f"Skipping row with {len(row)} values (expected {len(header)})")
1351
+ if len(row) != len(new_header):
1352
+ warnings.warn(
1353
+ f"Skipping row with {len(row)} values (expected {len(header)})"
1354
+ )
1277
1355
  continue
1278
-
1279
- scenario_dict = dict(zip(header, row))
1356
+
1357
+ scenario_dict = dict(zip(new_header, row))
1280
1358
  scenarios.append(Scenario(scenario_dict))
1281
-
1359
+
1282
1360
  return ScenarioList(scenarios)
1283
1361
 
1284
1362
 
1285
1363
  class CSVSource(DelimitedFileSource):
1286
1364
  source_type = "csv"
1287
-
1365
+
1288
1366
  def __init__(
1289
- self,
1367
+ self,
1290
1368
  file_or_url: str,
1291
1369
  has_header: bool = True,
1292
1370
  encoding: str = "utf-8",
1293
- **kwargs
1371
+ **kwargs,
1294
1372
  ):
1295
1373
  """
1296
1374
  Initialize a CSVSource with a path to a CSV file or URL.
1297
-
1375
+
1298
1376
  Args:
1299
1377
  file_or_url: Path to a local file or URL to a remote file.
1300
1378
  has_header: Whether the file has a header row (default is True).
@@ -1306,45 +1384,42 @@ class CSVSource(DelimitedFileSource):
1306
1384
  delimiter=",",
1307
1385
  has_header=has_header,
1308
1386
  encoding=encoding,
1309
- **kwargs
1387
+ **kwargs,
1310
1388
  )
1311
-
1389
+
1312
1390
  @classmethod
1313
- def example(cls) -> 'CSVSource':
1391
+ def example(cls) -> "CSVSource":
1314
1392
  """Return an example CSVSource instance."""
1315
1393
  import tempfile
1316
1394
  import os
1317
-
1395
+
1318
1396
  # Create a temporary CSV file with sample data
1319
- fd, temp_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
1397
+ fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
1320
1398
  os.close(fd) # Close the file descriptor
1321
-
1399
+
1322
1400
  # Write sample data to the file
1323
- with open(temp_path, 'w', newline='') as f:
1401
+ with open(temp_path, "w", newline="") as f:
1324
1402
  f.write("name,age,city\n")
1325
1403
  f.write("Alice,30,New York\n")
1326
1404
  f.write("Bob,25,San Francisco\n")
1327
1405
  f.write("Charlie,35,Boston\n")
1328
-
1329
- return cls(
1330
- file_or_url=temp_path,
1331
- has_header=True
1332
- )
1406
+
1407
+ return cls(file_or_url=temp_path, has_header=True)
1333
1408
 
1334
1409
 
1335
1410
  class TSVSource(DelimitedFileSource):
1336
1411
  source_type = "tsv"
1337
-
1412
+
1338
1413
  def __init__(
1339
- self,
1414
+ self,
1340
1415
  file_or_url: str,
1341
1416
  has_header: bool = True,
1342
1417
  encoding: str = "utf-8",
1343
- **kwargs
1418
+ **kwargs,
1344
1419
  ):
1345
1420
  """
1346
1421
  Initialize a TSVSource with a path to a TSV file or URL.
1347
-
1422
+
1348
1423
  Args:
1349
1424
  file_or_url: Path to a local file or URL to a remote file.
1350
1425
  has_header: Whether the file has a header row (default is True).
@@ -1356,130 +1431,134 @@ class TSVSource(DelimitedFileSource):
1356
1431
  delimiter="\t",
1357
1432
  has_header=has_header,
1358
1433
  encoding=encoding,
1359
- **kwargs
1434
+ **kwargs,
1360
1435
  )
1361
-
1436
+
1362
1437
  @classmethod
1363
- def example(cls) -> 'TSVSource':
1438
+ def example(cls) -> "TSVSource":
1364
1439
  """Return an example TSVSource instance."""
1365
1440
  import tempfile
1366
1441
  import os
1367
-
1442
+
1368
1443
  # Create a temporary TSV file with sample data
1369
- fd, temp_path = tempfile.mkstemp(suffix='.tsv', prefix='edsl_test_')
1444
+ fd, temp_path = tempfile.mkstemp(suffix=".tsv", prefix="edsl_test_")
1370
1445
  os.close(fd) # Close the file descriptor
1371
-
1446
+
1372
1447
  # Write sample data to the file
1373
- with open(temp_path, 'w', newline='') as f:
1448
+ with open(temp_path, "w", newline="") as f:
1374
1449
  f.write("name\tage\tcity\n")
1375
1450
  f.write("Alice\t30\tNew York\n")
1376
1451
  f.write("Bob\t25\tSan Francisco\n")
1377
1452
  f.write("Charlie\t35\tBoston\n")
1378
-
1379
- return cls(
1380
- file_or_url=temp_path,
1381
- has_header=True
1382
- )
1453
+
1454
+ return cls(file_or_url=temp_path, has_header=True)
1455
+
1383
1456
 
1384
1457
  class ParquetSource(Source):
1385
1458
  source_type = "parquet"
1386
-
1459
+
1387
1460
  def __init__(self, file_path: str):
1388
1461
  """
1389
1462
  Initialize a ParquetSource with a path to a Parquet file.
1390
-
1463
+
1391
1464
  Args:
1392
1465
  file_path: Path to the Parquet file.
1393
1466
  """
1394
1467
  self.file_path = file_path
1395
-
1468
+
1396
1469
  @classmethod
1397
- def example(cls) -> 'ParquetSource':
1470
+ def example(cls) -> "ParquetSource":
1398
1471
  """Return an example ParquetSource instance."""
1399
1472
  import tempfile
1400
1473
  import os
1401
-
1474
+
1402
1475
  try:
1403
1476
  import pandas as pd
1404
1477
  import pyarrow as pa
1405
1478
  import pyarrow.parquet as pq
1406
-
1479
+
1407
1480
  # Create a temporary Parquet file with sample data
1408
- fd, temp_path = tempfile.mkstemp(suffix='.parquet', prefix='edsl_test_')
1481
+ fd, temp_path = tempfile.mkstemp(suffix=".parquet", prefix="edsl_test_")
1409
1482
  os.close(fd) # Close the file descriptor
1410
-
1483
+
1411
1484
  # Create sample data
1412
- df = pd.DataFrame({
1413
- 'name': ['Alice', 'Bob', 'Charlie'],
1414
- 'age': [30, 25, 35],
1415
- 'city': ['New York', 'San Francisco', 'Boston']
1416
- })
1417
-
1485
+ df = pd.DataFrame(
1486
+ {
1487
+ "name": ["Alice", "Bob", "Charlie"],
1488
+ "age": [30, 25, 35],
1489
+ "city": ["New York", "San Francisco", "Boston"],
1490
+ }
1491
+ )
1492
+
1418
1493
  # Write to Parquet file
1419
1494
  df.to_parquet(temp_path)
1420
-
1495
+
1421
1496
  return cls(file_path=temp_path)
1422
-
1497
+
1423
1498
  except ImportError:
1424
1499
  # Create a mock instance with an override if pandas or pyarrow is not available
1425
1500
  instance = cls(file_path="/path/to/nonexistent/file.parquet")
1426
-
1501
+
1427
1502
  # Override the to_scenario_list method just for the example
1428
1503
  def mock_to_scenario_list(self):
1429
1504
  from .scenario_list import ScenarioList
1505
+
1430
1506
  # Create a simple mock ScenarioList with sample data
1431
1507
  scenarios = [
1432
1508
  Scenario({"name": "Alice", "age": 30, "city": "New York"}),
1433
1509
  Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
1434
- Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
1510
+ Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
1435
1511
  ]
1436
1512
  return ScenarioList(scenarios)
1437
-
1513
+
1438
1514
  # Replace the method on this instance only
1439
1515
  import types
1440
- instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1441
-
1516
+
1517
+ instance.to_scenario_list = types.MethodType(
1518
+ mock_to_scenario_list, instance
1519
+ )
1520
+
1442
1521
  return instance
1443
-
1522
+
1444
1523
  def to_scenario_list(self):
1445
1524
  """Create a ScenarioList from a Parquet file."""
1446
1525
  from .scenario_list import ScenarioList
1447
-
1526
+
1448
1527
  try:
1449
1528
  import pandas as pd
1450
1529
  except ImportError:
1451
1530
  raise ImportError("pandas is required to read Parquet files")
1452
-
1531
+
1453
1532
  try:
1454
1533
  import pyarrow
1455
1534
  except ImportError:
1456
1535
  raise ImportError("pyarrow is required to read Parquet files")
1457
-
1536
+
1458
1537
  # Read the Parquet file
1459
1538
  df = pd.read_parquet(self.file_path)
1460
-
1539
+
1461
1540
  # Convert DataFrame to ScenarioList
1462
1541
  scenarios = []
1463
1542
  for _, row in df.iterrows():
1464
1543
  scenario_dict = row.to_dict()
1465
1544
  scenarios.append(Scenario(scenario_dict))
1466
-
1545
+
1467
1546
  return ScenarioList(scenarios)
1468
1547
 
1469
1548
 
1470
1549
  class PDFSource(Source):
1471
1550
  source_type = "pdf"
1472
-
1551
+
1473
1552
  def __init__(
1474
- self,
1553
+ self,
1475
1554
  file_path: str,
1476
1555
  chunk_type: Literal["page", "text"] = "page",
1477
1556
  chunk_size: int = 1,
1478
- chunk_overlap: int = 0
1557
+ chunk_overlap: int = 0,
1479
1558
  ):
1480
1559
  """
1481
1560
  Initialize a PDFSource with a path to a PDF file.
1482
-
1561
+
1483
1562
  Args:
1484
1563
  file_path: Path to the PDF file or URL to a PDF.
1485
1564
  chunk_type: Type of chunking to use ("page" or "text").
@@ -1490,39 +1569,53 @@ class PDFSource(Source):
1490
1569
  self.chunk_type = chunk_type
1491
1570
  self.chunk_size = chunk_size
1492
1571
  self.chunk_overlap = chunk_overlap
1493
-
1572
+
1494
1573
  @classmethod
1495
- def example(cls) -> 'PDFSource':
1574
+ def example(cls) -> "PDFSource":
1496
1575
  """Return an example PDFSource instance."""
1497
1576
  # Skip actual file creation and just use a mock instance
1498
1577
  instance = cls(
1499
1578
  file_path="/path/to/nonexistent/file.pdf",
1500
1579
  chunk_type="page",
1501
1580
  chunk_size=1,
1502
- chunk_overlap=0
1581
+ chunk_overlap=0,
1503
1582
  )
1504
-
1583
+
1505
1584
  # Override the to_scenario_list method just for the example
1506
1585
  def mock_to_scenario_list(self):
1507
1586
  from .scenario_list import ScenarioList
1587
+
1508
1588
  # Create a simple mock ScenarioList with sample PDF data
1509
1589
  scenarios = [
1510
- Scenario({"filename": "example.pdf", "page": 1, "text": "This is page 1 content"}),
1511
- Scenario({"filename": "example.pdf", "page": 2, "text": "This is page 2 content"})
1590
+ Scenario(
1591
+ {
1592
+ "filename": "example.pdf",
1593
+ "page": 1,
1594
+ "text": "This is page 1 content",
1595
+ }
1596
+ ),
1597
+ Scenario(
1598
+ {
1599
+ "filename": "example.pdf",
1600
+ "page": 2,
1601
+ "text": "This is page 2 content",
1602
+ }
1603
+ ),
1512
1604
  ]
1513
1605
  return ScenarioList(scenarios)
1514
-
1606
+
1515
1607
  # Replace the method on this instance only
1516
1608
  import types
1609
+
1517
1610
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1518
-
1611
+
1519
1612
  return instance
1520
-
1613
+
1521
1614
  def to_scenario_list(self):
1522
1615
  """Create a ScenarioList from a PDF file."""
1523
1616
  from .scenario_list import ScenarioList
1524
1617
  from .scenario_list_pdf_tools import PdfTools
1525
-
1618
+
1526
1619
  try:
1527
1620
  # Check if it's a URL
1528
1621
  if PdfTools.is_url(self.file_path):
@@ -1534,14 +1627,16 @@ class PDFSource(Source):
1534
1627
  )
1535
1628
  else:
1536
1629
  # It's a regular URL
1537
- local_path = PdfTools.fetch_and_save_pdf(self.file_path, "temp_pdf.pdf")
1630
+ local_path = PdfTools.fetch_and_save_pdf(
1631
+ self.file_path, "temp_pdf.pdf"
1632
+ )
1538
1633
  else:
1539
1634
  # It's a local file path
1540
1635
  local_path = self.file_path
1541
-
1636
+
1542
1637
  # Extract scenarios from the PDF
1543
1638
  scenarios = list(PdfTools.extract_text_from_pdf(local_path))
1544
-
1639
+
1545
1640
  # Handle chunking based on the specified parameters
1546
1641
  if self.chunk_type == "page":
1547
1642
  # Default behavior - one scenario per page
@@ -1551,31 +1646,31 @@ class PDFSource(Source):
1551
1646
  combined_text = ""
1552
1647
  for scenario in scenarios:
1553
1648
  combined_text += scenario["text"]
1554
-
1649
+
1555
1650
  # Create a single scenario with all text
1556
1651
  base_scenario = scenarios[0].copy()
1557
1652
  base_scenario["text"] = combined_text
1558
1653
  return ScenarioList([base_scenario])
1559
1654
  else:
1560
- raise ValueError(f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'.")
1561
-
1655
+ raise ValueError(
1656
+ f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'."
1657
+ )
1658
+
1562
1659
  except Exception as e:
1563
1660
  from .exceptions import ScenarioError
1661
+
1564
1662
  raise ScenarioError(f"Error processing PDF: {str(e)}")
1565
1663
 
1566
1664
 
1567
1665
  class PDFImageSource(Source):
1568
1666
  source_type = "pdf_to_image"
1569
-
1667
+
1570
1668
  def __init__(
1571
- self,
1572
- file_path: str,
1573
- base_width: int = 2000,
1574
- include_text: bool = True
1669
+ self, file_path: str, base_width: int = 2000, include_text: bool = True
1575
1670
  ):
1576
1671
  """
1577
1672
  Initialize a PDFImageSource with a path to a PDF file.
1578
-
1673
+
1579
1674
  Args:
1580
1675
  file_path: Path to the PDF file.
1581
1676
  base_width: Width to use for the generated images.
@@ -1584,74 +1679,91 @@ class PDFImageSource(Source):
1584
1679
  self.file_path = file_path
1585
1680
  self.base_width = base_width
1586
1681
  self.include_text = include_text
1587
-
1682
+
1588
1683
  @classmethod
1589
- def example(cls) -> 'PDFImageSource':
1684
+ def example(cls) -> "PDFImageSource":
1590
1685
  """Return an example PDFImageSource instance."""
1591
1686
  # Skip actual file creation and just use a mock instance
1592
1687
  instance = cls(
1593
1688
  file_path="/path/to/nonexistent/file.pdf",
1594
1689
  base_width=2000,
1595
- include_text=True
1690
+ include_text=True,
1596
1691
  )
1597
-
1692
+
1598
1693
  # Override the to_scenario_list method just for the example
1599
1694
  def mock_to_scenario_list(self):
1600
1695
  from .scenario_list import ScenarioList
1696
+
1601
1697
  # Create a simple mock ScenarioList with sample PDF image data
1602
1698
  scenarios = [
1603
- Scenario({"filepath": "/tmp/page_1.jpeg", "page": 0, "text": "This is page 1 content"}),
1604
- Scenario({"filepath": "/tmp/page_2.jpeg", "page": 1, "text": "This is page 2 content"})
1699
+ Scenario(
1700
+ {
1701
+ "filepath": "/tmp/page_1.jpeg",
1702
+ "page": 0,
1703
+ "text": "This is page 1 content",
1704
+ }
1705
+ ),
1706
+ Scenario(
1707
+ {
1708
+ "filepath": "/tmp/page_2.jpeg",
1709
+ "page": 1,
1710
+ "text": "This is page 2 content",
1711
+ }
1712
+ ),
1605
1713
  ]
1606
1714
  return ScenarioList(scenarios)
1607
-
1715
+
1608
1716
  # Replace the method on this instance only
1609
1717
  import types
1718
+
1610
1719
  instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
1611
-
1720
+
1612
1721
  return instance
1613
-
1722
+
1614
1723
  def to_scenario_list(self):
1615
1724
  """Create a ScenarioList from a PDF file, converting pages to images."""
1616
1725
  from .scenario_list import ScenarioList
1617
1726
  from .scenario_list_pdf_tools import PdfTools
1618
-
1727
+
1619
1728
  try:
1620
1729
  # Import pdf2image library
1621
1730
  try:
1622
1731
  from pdf2image import convert_from_path
1623
1732
  except ImportError:
1624
- raise ImportError("pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'.")
1625
-
1733
+ raise ImportError(
1734
+ "pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'."
1735
+ )
1736
+
1626
1737
  # Convert PDF pages to images
1627
1738
  scenarios = PdfTools.from_pdf_to_image(self.file_path, image_format="jpeg")
1628
1739
  return ScenarioList(scenarios)
1629
-
1740
+
1630
1741
  except Exception as e:
1631
1742
  from .exceptions import ScenarioError
1743
+
1632
1744
  raise ScenarioError(f"Error converting PDF to images: {str(e)}")
1633
1745
 
1634
1746
 
1635
1747
  class ScenarioSource:
1636
1748
  """
1637
1749
  Factory class for creating ScenarioList objects from various sources.
1638
-
1750
+
1639
1751
  This class provides static methods for creating ScenarioList objects from different
1640
1752
  data sources, centralizing the creation logic that was previously scattered across
1641
1753
  different classmethods in the ScenarioList class.
1642
-
1754
+
1643
1755
  The main entry point is the from_source method, which dispatches to appropriate
1644
1756
  source-specific methods based on the source_type parameter.
1645
1757
  """
1646
-
1758
+
1647
1759
  @staticmethod
1648
1760
  def from_source(source_type: str, *args, **kwargs):
1649
1761
  """
1650
1762
  Create a ScenarioList from a specified source type.
1651
-
1763
+
1652
1764
  This method serves as the main entry point for creating ScenarioList objects,
1653
1765
  dispatching to the appropriate source-specific method based on the source_type.
1654
-
1766
+
1655
1767
  Args:
1656
1768
  source_type: The type of source to create a ScenarioList from.
1657
1769
  Valid values include: 'urls', 'directory', 'list', 'list_of_tuples',
@@ -1660,10 +1772,10 @@ class ScenarioSource:
1660
1772
  'nested_dict', 'parquet', 'pdf', 'pdf_to_image'.
1661
1773
  *args: Positional arguments to pass to the source-specific method.
1662
1774
  **kwargs: Keyword arguments to pass to the source-specific method.
1663
-
1775
+
1664
1776
  Returns:
1665
1777
  A ScenarioList object created from the specified source.
1666
-
1778
+
1667
1779
  Raises:
1668
1780
  ValueError: If the source_type is not recognized.
1669
1781
  """
@@ -1679,14 +1791,14 @@ class ScenarioSource:
1679
1791
  return method(*args, **kwargs)
1680
1792
  else:
1681
1793
  raise ValueError(f"Unsupported source type: {source_type}")
1682
-
1794
+
1683
1795
  @staticmethod
1684
1796
  def _from_urls(urls: list[str], field_name: Optional[str] = "text"):
1685
1797
  """Create a ScenarioList from a list of URLs."""
1686
1798
  from .scenario_list import ScenarioList
1687
-
1799
+
1688
1800
  import requests
1689
-
1801
+
1690
1802
  result = ScenarioList()
1691
1803
  for url in urls:
1692
1804
  try:
@@ -1697,9 +1809,9 @@ class ScenarioSource:
1697
1809
  except requests.RequestException as e:
1698
1810
  warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
1699
1811
  continue
1700
-
1812
+
1701
1813
  return result
1702
-
1814
+
1703
1815
  @staticmethod
1704
1816
  def _from_directory(
1705
1817
  directory: str,
@@ -1713,7 +1825,7 @@ class ScenarioSource:
1713
1825
  warnings.warn(
1714
1826
  "_from_directory is deprecated. Use DirectorySource directly or ScenarioSource.from_source('directory', ...) instead.",
1715
1827
  DeprecationWarning,
1716
- stacklevel=2
1828
+ stacklevel=2,
1717
1829
  )
1718
1830
  source = DirectorySource(
1719
1831
  directory=directory,
@@ -1721,23 +1833,21 @@ class ScenarioSource:
1721
1833
  recursive=recursive,
1722
1834
  metadata=metadata,
1723
1835
  ignore_dirs=ignore_dirs,
1724
- ignore_files=ignore_files
1836
+ ignore_files=ignore_files,
1725
1837
  )
1726
1838
  return source.to_scenario_list()
1727
-
1839
+
1728
1840
  @staticmethod
1729
- def _from_list(
1730
- field_name: str, values: list, use_indexes: bool = False
1731
- ):
1841
+ def _from_list(field_name: str, values: list, use_indexes: bool = False):
1732
1842
  """Create a ScenarioList from a list of values with a specified field name."""
1733
1843
  warnings.warn(
1734
1844
  "_from_list is deprecated. Use ListSource directly or ScenarioSource.from_source('list', ...) instead.",
1735
1845
  DeprecationWarning,
1736
- stacklevel=2
1846
+ stacklevel=2,
1737
1847
  )
1738
1848
  source = ListSource(field_name, values, use_indexes)
1739
1849
  return source.to_scenario_list()
1740
-
1850
+
1741
1851
  @staticmethod
1742
1852
  def _from_list_of_tuples(
1743
1853
  field_names: list[str], values: list[tuple], use_indexes: bool = False
@@ -1746,107 +1856,106 @@ class ScenarioSource:
1746
1856
  warnings.warn(
1747
1857
  "_from_list_of_tuples is deprecated. Use TuplesSource directly or ScenarioSource.from_source('list_of_tuples', ...) instead.",
1748
1858
  DeprecationWarning,
1749
- stacklevel=2
1859
+ stacklevel=2,
1750
1860
  )
1751
1861
  source = TuplesSource(field_names, values, use_indexes)
1752
1862
  return source.to_scenario_list()
1753
-
1863
+
1754
1864
  @staticmethod
1755
- def _from_sqlite(
1756
- db_path: str, table: str, fields: Optional[list] = None
1757
- ):
1865
+ def _from_sqlite(db_path: str, table: str, fields: Optional[list] = None):
1758
1866
  """Create a ScenarioList from a SQLite database."""
1759
1867
  warnings.warn(
1760
1868
  "_from_sqlite is deprecated. Use SQLiteSource directly or ScenarioSource.from_source('sqlite', ...) instead.",
1761
1869
  DeprecationWarning,
1762
- stacklevel=2
1870
+ stacklevel=2,
1763
1871
  )
1764
1872
  source = SQLiteSource(db_path, table, fields)
1765
1873
  return source.to_scenario_list()
1766
-
1874
+
1767
1875
  @staticmethod
1768
- def _from_latex(
1769
- file_path: str, table_index: int = 0, has_header: bool = True
1770
- ):
1876
+ def _from_latex(file_path: str, table_index: int = 0, has_header: bool = True):
1771
1877
  """Create a ScenarioList from a LaTeX file."""
1772
1878
  warnings.warn(
1773
1879
  "_from_latex is deprecated. Use LaTeXSource directly or ScenarioSource.from_source('latex', ...) instead.",
1774
1880
  DeprecationWarning,
1775
- stacklevel=2
1881
+ stacklevel=2,
1776
1882
  )
1777
1883
  source = LaTeXSource(file_path, table_index, has_header)
1778
1884
  return source.to_scenario_list()
1779
-
1885
+
1780
1886
  @staticmethod
1781
1887
  def _from_google_doc(url: str):
1782
1888
  """Create a ScenarioList from a Google Doc."""
1783
1889
  warnings.warn(
1784
1890
  "_from_google_doc is deprecated. Use GoogleDocSource directly or ScenarioSource.from_source('google_doc', ...) instead.",
1785
1891
  DeprecationWarning,
1786
- stacklevel=2
1892
+ stacklevel=2,
1787
1893
  )
1788
1894
  source = GoogleDocSource(url)
1789
1895
  return source.to_scenario_list()
1790
-
1896
+
1791
1897
  @staticmethod
1792
1898
  def _from_pandas(df):
1793
1899
  """Create a ScenarioList from a pandas DataFrame."""
1794
1900
  warnings.warn(
1795
1901
  "_from_pandas is deprecated. Use PandasSource directly or ScenarioSource.from_source('pandas', ...) instead.",
1796
1902
  DeprecationWarning,
1797
- stacklevel=2
1903
+ stacklevel=2,
1798
1904
  )
1799
1905
  source = PandasSource(df)
1800
1906
  return source.to_scenario_list()
1801
-
1907
+
1802
1908
  @staticmethod
1803
1909
  def _from_dta(file_path: str, include_metadata: bool = True):
1804
1910
  """Create a ScenarioList from a Stata data file."""
1805
1911
  warnings.warn(
1806
1912
  "_from_dta is deprecated. Use StataSource directly or ScenarioSource.from_source('dta', ...) instead.",
1807
1913
  DeprecationWarning,
1808
- stacklevel=2
1914
+ stacklevel=2,
1809
1915
  )
1810
1916
  source = StataSource(file_path, include_metadata)
1811
1917
  return source.to_scenario_list()
1812
-
1918
+
1813
1919
  @staticmethod
1814
- def _from_wikipedia(
1815
- url: str, table_index: int = 0, header: bool = True
1816
- ):
1920
+ def _from_wikipedia(url: str, table_index: int = 0, header: bool = True):
1817
1921
  """Create a ScenarioList from a table on a Wikipedia page."""
1818
1922
  warnings.warn(
1819
1923
  "_from_wikipedia is deprecated. Use WikipediaSource directly or ScenarioSource.from_source('wikipedia', ...) instead.",
1820
1924
  DeprecationWarning,
1821
- stacklevel=2
1925
+ stacklevel=2,
1822
1926
  )
1823
1927
  source = WikipediaSource(url, table_index, header)
1824
1928
  return source.to_scenario_list()
1825
-
1929
+
1826
1930
  @staticmethod
1827
- def _from_excel(
1828
- file_path: str, sheet_name: Optional[str] = None, **kwargs
1829
- ):
1931
+ def _from_excel(file_path: str, sheet_name: Optional[str] = None, **kwargs):
1830
1932
  """Create a ScenarioList from an Excel file."""
1831
1933
  warnings.warn(
1832
1934
  "_from_excel is deprecated. Use ExcelSource directly or ScenarioSource.from_source('excel', ...) instead.",
1833
1935
  DeprecationWarning,
1834
- stacklevel=2
1936
+ stacklevel=2,
1835
1937
  )
1836
1938
  source = ExcelSource(file_path, sheet_name=sheet_name, **kwargs)
1837
1939
  return source.to_scenario_list()
1838
-
1940
+
1839
1941
  @staticmethod
1840
- def _from_google_sheet(url: str, sheet_name: Optional[str] = None, column_names: Optional[List[str]] = None, **kwargs):
1942
+ def _from_google_sheet(
1943
+ url: str,
1944
+ sheet_name: Optional[str] = None,
1945
+ column_names: Optional[List[str]] = None,
1946
+ **kwargs,
1947
+ ):
1841
1948
  """Create a ScenarioList from a Google Sheet."""
1842
1949
  warnings.warn(
1843
1950
  "_from_google_sheet is deprecated. Use GoogleSheetSource directly or ScenarioSource.from_source('google_sheet', ...) instead.",
1844
1951
  DeprecationWarning,
1845
- stacklevel=2
1952
+ stacklevel=2,
1953
+ )
1954
+ source = GoogleSheetSource(
1955
+ url, sheet_name=sheet_name, column_names=column_names, **kwargs
1846
1956
  )
1847
- source = GoogleSheetSource(url, sheet_name=sheet_name, column_names=column_names, **kwargs)
1848
1957
  return source.to_scenario_list()
1849
-
1958
+
1850
1959
  @staticmethod
1851
1960
  def _from_delimited_file(
1852
1961
  file_or_url: str,
@@ -1859,44 +1968,44 @@ class ScenarioSource:
1859
1968
  warnings.warn(
1860
1969
  "_from_delimited_file is deprecated. Use DelimitedFileSource directly or ScenarioSource.from_source('delimited_file', ...) instead.",
1861
1970
  DeprecationWarning,
1862
- stacklevel=2
1971
+ stacklevel=2,
1863
1972
  )
1864
1973
  source = DelimitedFileSource(
1865
1974
  file_or_url=file_or_url,
1866
1975
  delimiter=delimiter,
1867
1976
  has_header=has_header,
1868
1977
  encoding=encoding,
1869
- **kwargs
1978
+ **kwargs,
1870
1979
  )
1871
1980
  return source.to_scenario_list()
1872
-
1981
+
1873
1982
  @staticmethod
1874
1983
  def _from_csv(file_or_url: str, **kwargs):
1875
1984
  """Create a ScenarioList from a CSV file or URL."""
1876
1985
  warnings.warn(
1877
1986
  "_from_csv is deprecated. Use CSVSource directly or ScenarioSource.from_source('csv', ...) instead.",
1878
1987
  DeprecationWarning,
1879
- stacklevel=2
1988
+ stacklevel=2,
1880
1989
  )
1881
1990
  source = CSVSource(file_or_url=file_or_url, **kwargs)
1882
1991
  return source.to_scenario_list()
1883
-
1992
+
1884
1993
  @staticmethod
1885
1994
  def _from_tsv(file_or_url: str, **kwargs):
1886
1995
  """Create a ScenarioList from a TSV file or URL."""
1887
1996
  warnings.warn(
1888
1997
  "_from_tsv is deprecated. Use TSVSource directly or ScenarioSource.from_source('tsv', ...) instead.",
1889
1998
  DeprecationWarning,
1890
- stacklevel=2
1999
+ stacklevel=2,
1891
2000
  )
1892
2001
  source = TSVSource(file_or_url=file_or_url, **kwargs)
1893
2002
  return source.to_scenario_list()
1894
-
2003
+
1895
2004
  @staticmethod
1896
2005
  def _from_dict(data: dict):
1897
2006
  """Create a ScenarioList from a dictionary."""
1898
2007
  from .scenario_list import ScenarioList
1899
-
2008
+
1900
2009
  if "scenarios" in data:
1901
2010
  scenarios = [Scenario(s) for s in data["scenarios"]]
1902
2011
  codebook = data.get("codebook", {})
@@ -1907,48 +2016,48 @@ class ScenarioSource:
1907
2016
  field_names = list(data.keys())
1908
2017
  if not all(isinstance(v, list) for v in data.values()):
1909
2018
  raise ScenarioError("All values in the dictionary must be lists")
1910
-
2019
+
1911
2020
  # Check all lists have the same length
1912
2021
  list_lengths = [len(v) for v in data.values()]
1913
2022
  if not all(l == list_lengths[0] for l in list_lengths):
1914
2023
  raise ScenarioError("All lists must have the same length")
1915
-
2024
+
1916
2025
  # Create scenarios
1917
2026
  for i in range(list_lengths[0]):
1918
2027
  scenario_dict = {k: data[k][i] for k in field_names}
1919
2028
  scenarios.append(Scenario(scenario_dict))
1920
-
2029
+
1921
2030
  return ScenarioList(scenarios)
1922
-
2031
+
1923
2032
  @staticmethod
1924
2033
  def _from_nested_dict(data: dict, id_field: Optional[str] = None):
1925
2034
  """Create a ScenarioList from a nested dictionary."""
1926
2035
  from .scenario_list import ScenarioList
1927
-
2036
+
1928
2037
  scenarios = []
1929
-
2038
+
1930
2039
  for key, value in data.items():
1931
2040
  if not isinstance(value, dict):
1932
2041
  raise ScenarioError(f"Value for key {key} is not a dictionary")
1933
-
2042
+
1934
2043
  scenario_dict = value.copy()
1935
2044
  if id_field:
1936
2045
  scenario_dict[id_field] = key
1937
2046
  scenarios.append(Scenario(scenario_dict))
1938
-
2047
+
1939
2048
  return ScenarioList(scenarios)
1940
-
2049
+
1941
2050
  @staticmethod
1942
2051
  def _from_parquet(file_path: str):
1943
2052
  """Create a ScenarioList from a Parquet file."""
1944
2053
  warnings.warn(
1945
2054
  "_from_parquet is deprecated. Use ParquetSource directly or ScenarioSource.from_source('parquet', ...) instead.",
1946
2055
  DeprecationWarning,
1947
- stacklevel=2
2056
+ stacklevel=2,
1948
2057
  )
1949
2058
  source = ParquetSource(file_path)
1950
2059
  return source.to_scenario_list()
1951
-
2060
+
1952
2061
  @staticmethod
1953
2062
  def _from_pdf(
1954
2063
  file_path: str,
@@ -1960,16 +2069,16 @@ class ScenarioSource:
1960
2069
  warnings.warn(
1961
2070
  "_from_pdf is deprecated. Use PDFSource directly or ScenarioSource.from_source('pdf', ...) instead.",
1962
2071
  DeprecationWarning,
1963
- stacklevel=2
2072
+ stacklevel=2,
1964
2073
  )
1965
2074
  source = PDFSource(
1966
2075
  file_path=file_path,
1967
2076
  chunk_type=chunk_type,
1968
2077
  chunk_size=chunk_size,
1969
- chunk_overlap=chunk_overlap
2078
+ chunk_overlap=chunk_overlap,
1970
2079
  )
1971
2080
  return source.to_scenario_list()
1972
-
2081
+
1973
2082
  @staticmethod
1974
2083
  def _from_pdf_to_image(
1975
2084
  file_path: str,
@@ -1980,11 +2089,9 @@ class ScenarioSource:
1980
2089
  warnings.warn(
1981
2090
  "_from_pdf_to_image is deprecated. Use PDFImageSource directly or ScenarioSource.from_source('pdf_to_image', ...) instead.",
1982
2091
  DeprecationWarning,
1983
- stacklevel=2
2092
+ stacklevel=2,
1984
2093
  )
1985
2094
  source = PDFImageSource(
1986
- file_path=file_path,
1987
- base_width=base_width,
1988
- include_text=include_text
2095
+ file_path=file_path, base_width=base_width, include_text=include_text
1989
2096
  )
1990
- return source.to_scenario_list()
2097
+ return source.to_scenario_list()