edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/base/data_transfer_models.py +15 -4
  8. edsl/buckets/__init__.py +8 -3
  9. edsl/buckets/bucket_collection.py +9 -3
  10. edsl/buckets/model_buckets.py +4 -2
  11. edsl/buckets/token_bucket.py +2 -2
  12. edsl/buckets/token_bucket_client.py +5 -3
  13. edsl/caching/cache.py +131 -62
  14. edsl/caching/cache_entry.py +70 -58
  15. edsl/caching/sql_dict.py +17 -0
  16. edsl/cli.py +99 -0
  17. edsl/config/config_class.py +16 -0
  18. edsl/conversation/__init__.py +31 -0
  19. edsl/coop/coop.py +276 -242
  20. edsl/coop/coop_jobs_objects.py +59 -0
  21. edsl/coop/coop_objects.py +29 -0
  22. edsl/coop/coop_regular_objects.py +26 -0
  23. edsl/coop/utils.py +24 -19
  24. edsl/dataset/dataset.py +338 -101
  25. edsl/dataset/dataset_operations_mixin.py +216 -180
  26. edsl/db_list/sqlite_list.py +349 -0
  27. edsl/inference_services/__init__.py +40 -5
  28. edsl/inference_services/exceptions.py +11 -0
  29. edsl/inference_services/services/anthropic_service.py +5 -2
  30. edsl/inference_services/services/aws_bedrock.py +6 -2
  31. edsl/inference_services/services/azure_ai.py +6 -2
  32. edsl/inference_services/services/google_service.py +7 -3
  33. edsl/inference_services/services/mistral_ai_service.py +6 -2
  34. edsl/inference_services/services/open_ai_service.py +6 -2
  35. edsl/inference_services/services/perplexity_service.py +6 -2
  36. edsl/inference_services/services/test_service.py +94 -5
  37. edsl/interviews/answering_function.py +167 -59
  38. edsl/interviews/interview.py +124 -72
  39. edsl/interviews/interview_task_manager.py +10 -0
  40. edsl/interviews/request_token_estimator.py +8 -0
  41. edsl/invigilators/invigilators.py +35 -13
  42. edsl/jobs/async_interview_runner.py +146 -104
  43. edsl/jobs/data_structures.py +6 -4
  44. edsl/jobs/decorators.py +61 -0
  45. edsl/jobs/fetch_invigilator.py +61 -18
  46. edsl/jobs/html_table_job_logger.py +14 -2
  47. edsl/jobs/jobs.py +180 -104
  48. edsl/jobs/jobs_component_constructor.py +2 -2
  49. edsl/jobs/jobs_interview_constructor.py +2 -0
  50. edsl/jobs/jobs_pricing_estimation.py +154 -113
  51. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  52. edsl/jobs/jobs_runner_status.py +30 -25
  53. edsl/jobs/progress_bar_manager.py +79 -0
  54. edsl/jobs/remote_inference.py +35 -1
  55. edsl/key_management/key_lookup_builder.py +6 -1
  56. edsl/language_models/language_model.py +110 -12
  57. edsl/language_models/model.py +10 -3
  58. edsl/language_models/price_manager.py +176 -71
  59. edsl/language_models/registry.py +5 -0
  60. edsl/notebooks/notebook.py +77 -10
  61. edsl/questions/VALIDATION_README.md +134 -0
  62. edsl/questions/__init__.py +24 -1
  63. edsl/questions/exceptions.py +21 -0
  64. edsl/questions/question_dict.py +201 -16
  65. edsl/questions/question_multiple_choice_with_other.py +624 -0
  66. edsl/questions/question_registry.py +2 -1
  67. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  68. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  69. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  70. edsl/questions/validation_analysis.py +185 -0
  71. edsl/questions/validation_cli.py +131 -0
  72. edsl/questions/validation_html_report.py +404 -0
  73. edsl/questions/validation_logger.py +136 -0
  74. edsl/results/result.py +115 -46
  75. edsl/results/results.py +702 -171
  76. edsl/scenarios/construct_download_link.py +16 -3
  77. edsl/scenarios/directory_scanner.py +226 -226
  78. edsl/scenarios/file_methods.py +5 -0
  79. edsl/scenarios/file_store.py +150 -9
  80. edsl/scenarios/handlers/__init__.py +5 -1
  81. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  82. edsl/scenarios/handlers/webm_file_store.py +104 -0
  83. edsl/scenarios/scenario.py +120 -101
  84. edsl/scenarios/scenario_list.py +800 -727
  85. edsl/scenarios/scenario_list_gc_test.py +146 -0
  86. edsl/scenarios/scenario_list_memory_test.py +214 -0
  87. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  88. edsl/scenarios/scenario_selector.py +5 -4
  89. edsl/scenarios/scenario_source.py +1990 -0
  90. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  91. edsl/surveys/survey.py +22 -0
  92. edsl/tasks/__init__.py +4 -2
  93. edsl/tasks/task_history.py +198 -36
  94. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  95. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  96. edsl/utilities/__init__.py +2 -1
  97. edsl/utilities/decorators.py +121 -0
  98. edsl/utilities/memory_debugger.py +1010 -0
  99. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
  100. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
  101. edsl/jobs/jobs_runner_asyncio.py +0 -281
  102. edsl/language_models/unused/fake_openai_service.py +0 -60
  103. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
  104. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
  105. {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1990 @@
1
+ """
2
+ ScenarioSource provides factory methods for creating ScenarioList objects from external sources.
3
+
4
+ This module contains the ScenarioSource class, which serves as a factory for creating
5
+ ScenarioList objects from various external data sources like files, directories, and URLs.
6
+ It centralizes all the file/external-source creation logic that was previously scattered
7
+ across different classmethods in the ScenarioList class.
8
+
9
+ Key features include:
10
+ - A unified from_source method that dispatches to appropriate source-specific methods
11
+ - Support for various data sources (CSV, Excel, PDF, directories, URLs, etc.)
12
+ - Deprecation decorators for backward compatibility with ScenarioList class methods
13
+ """
14
+
15
+ from __future__ import annotations
16
+ import functools
17
+ import warnings
18
+ import fnmatch
19
+ from typing import Any, Callable, List, Literal, Optional, Type, TypeVar, Union, TYPE_CHECKING, cast, Any
20
+
21
+ T = TypeVar('T')
22
+
23
def deprecated_classmethod(alternative: str) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a decorator that flags a class method as deprecated.

    Each call to the decorated callable first emits a ``DeprecationWarning`` that
    names the callable (by ``__qualname__``) and the suggested replacement, then
    forwards the call unchanged.

    Args:
        alternative: Text naming what callers should use instead

    Returns:
        A decorator that wraps a callable with the deprecation warning
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        def wrapper(*args: Any, **kwargs: Any) -> T:
            message = f"{func.__qualname__} is deprecated. Use {alternative} instead."
            # stacklevel=2 attributes the warning to the caller rather than to this wrapper.
            warnings.warn(message, DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        # Copy the wrapped callable's metadata (name, docstring, ...) onto the wrapper.
        return functools.wraps(func)(wrapper)

    return decorator
44
+
45
+ import os
46
+ import csv
47
+ import json
48
+ import warnings
49
+ from io import StringIO
50
+ from urllib.parse import urlparse
51
+
52
+ if TYPE_CHECKING:
53
+ import pandas as pd
54
+ from urllib.parse import ParseResult
55
+ from .scenario_list import ScenarioList
56
+
57
+ # Local imports
58
+ from .scenario import Scenario
59
+ from .directory_scanner import DirectoryScanner
60
+ from .exceptions import ScenarioError
61
+
62
+ from abc import ABC, abstractmethod
63
+
64
class Source(ABC):
    """
    Abstract base for objects that can be converted into a ScenarioList.

    Subclasses exposing a ``source_type`` attribute are recorded in a shared
    registry when the subclass is created, keyed by that attribute.
    """

    # Maps each source_type string to the subclass registered under it
    _registry: dict[str, Type['Source']] = {}

    def __init_subclass__(cls, **kwargs):
        """Record the newly created subclass in the registry if it has a source_type."""
        super().__init_subclass__(**kwargs)
        # hasattr also sees an inherited source_type, so a subclass of an already
        # registered source replaces its parent's registry entry.
        if not hasattr(cls, 'source_type'):
            return
        Source._registry[cls.source_type] = cls

    @classmethod
    @abstractmethod
    def example(cls) -> 'Source':
        """
        Build a sample instance of this Source type.

        Implementations should return an instance configured with sensible
        defaults, suitable for testing, whose to_scenario_list() call yields
        a valid ScenarioList.

        Returns:
            An instance of the Source subclass
        """
        pass

    @abstractmethod
    def to_scenario_list(self):
        """
        Produce a ScenarioList from this source.

        Returns:
            A ScenarioList containing the data from this source
        """
        pass

    @classmethod
    def get_source_class(cls, source_type: str) -> Type['Source']:
        """
        Look up the Source subclass registered under ``source_type``.

        Raises:
            ValueError: If nothing is registered under ``source_type``.
        """
        registered = cls._registry.get(source_type)
        if registered is None:
            raise ValueError(f"No Source subclass found for source_type: {source_type}")
        return registered

    @classmethod
    def get_registered_types(cls) -> list[str]:
        """Return the names of every registered source type."""
        return [*cls._registry]

    @classmethod
    def test_all_sources(cls) -> dict[str, bool]:
        """
        Exercise every registered source type.

        For each registered class, build its example instance, call
        to_scenario_list() on it, and check that the result is a ScenarioList.
        Failures are printed and recorded rather than raised.

        Returns:
            A dictionary mapping source types to boolean success values
        """
        from .scenario_list import ScenarioList

        outcomes = {}
        for name, klass in cls._registry.items():
            try:
                produced = klass.example().to_scenario_list()
                passed = isinstance(produced, ScenarioList)
                if not passed:
                    print(f"Source {name} returned {type(produced)} instead of ScenarioList")
                outcomes[name] = passed
            except Exception as exc:
                # Any failure while building or converting the example counts as a fail.
                outcomes[name] = False
                print(f"Source {name} exception: {exc}")
        return outcomes
140
+
141
class URLSource(Source):
    """
    Source that downloads a list of URLs and stores each response body in a Scenario.

    URLs that cannot be fetched are reported with a warning and skipped.
    """

    source_type = "urls"

    def __init__(self, urls: list[str], field_name: str, timeout: Optional[float] = 30.0):
        """
        Initialize a URLSource.

        Args:
            urls: The URLs to fetch.
            field_name: Scenario key under which each response body is stored.
            timeout: Seconds to wait on each request before giving up. ``None``
                disables the limit and waits indefinitely.
        """
        self.urls = urls
        self.field_name = field_name
        self.timeout = timeout

    @classmethod
    def example(cls) -> 'URLSource':
        """Return an example URLSource instance."""
        return cls(
            urls=['http://www.example.com'],
            field_name="text"
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from a list of URLs.

        Returns:
            A ScenarioList with one Scenario per successfully fetched URL, in
            the order the URLs were given.
        """
        import requests

        from .scenario_list import ScenarioList

        result = ScenarioList()
        for url in self.urls:
            try:
                # Without a timeout a single unresponsive host would block forever.
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
                scenario = Scenario({self.field_name: response.text})
            except requests.RequestException as e:
                # Best effort: report the failure and move on to the next URL.
                warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
                continue
            result.append(scenario)

        return result
174
+
175
+
176
class ListSource(Source):
    """Source that turns a plain list of values into single-field Scenarios."""

    source_type = "list"

    def __init__(self, field_name: str, values: list, use_indexes: bool = False):
        """
        Initialize a ListSource.

        Args:
            field_name: Scenario key under which each value is stored.
            values: The values to convert, one Scenario per value.
            use_indexes: If True, also store each value's position under "idx".
        """
        self.use_indexes = use_indexes
        self.values = values
        self.field_name = field_name

    @classmethod
    def example(cls) -> 'ListSource':
        """Return an example ListSource instance."""
        sample_values = ["example1", "example2", "example3"]
        return cls(field_name="text", values=sample_values, use_indexes=True)

    def to_scenario_list(self):
        """Create a ScenarioList from a list of values with a specified field name."""
        from .scenario_list import ScenarioList

        def build(position, value):
            # One Scenario per value; the position is added only on request.
            record = {self.field_name: value}
            if self.use_indexes:
                record["idx"] = position
            return Scenario(record)

        return ScenarioList([build(i, v) for i, v in enumerate(self.values)])
206
+
207
+
208
class DirectorySource(Source):
    """
    Source that builds Scenarios from the files found in a directory.

    ``directory`` may be a plain directory path (combined with ``pattern``) or may
    itself contain wildcards such as ``"*.txt"``, ``"/path/to/dir/*.py"`` or
    ``"src/**/*.py"``.
    """

    source_type = "directory"

    def __init__(
        self,
        directory: str,
        pattern: str = "*",
        recursive: bool = False,
        metadata: bool = True,
        ignore_dirs: Optional[List[str]] = None,
        ignore_files: Optional[List[str]] = None,
    ):
        """
        Initialize a DirectorySource.

        Args:
            directory: Directory to scan, optionally containing wildcards.
            pattern: File pattern used when ``directory`` has no wildcard.
            recursive: Whether the standard scan descends into subdirectories.
            metadata: Whether file metadata is added to each Scenario.
            ignore_dirs: Directory patterns to skip (``None`` means none).
            ignore_files: File-name patterns to skip (``None`` means none).
        """
        self.directory = directory
        self.pattern = pattern
        self.recursive = recursive
        self.metadata = metadata
        self.ignore_dirs = ignore_dirs or []
        self.ignore_files = ignore_files or []

    @classmethod
    def example(cls) -> 'DirectorySource':
        """Return an example DirectorySource instance backed by a fresh temp directory."""
        import tempfile
        import os

        # Create a temporary directory for the example
        temp_dir = tempfile.mkdtemp(prefix="edsl_test_")

        # Create some sample files in the directory
        with open(os.path.join(temp_dir, "test1.txt"), "w") as f:
            f.write("Sample content 1")

        with open(os.path.join(temp_dir, "test2.txt"), "w") as f:
            f.write("Sample content 2")

        # Create a subdirectory with a file
        subdir = os.path.join(temp_dir, "subdir")
        os.makedirs(subdir, exist_ok=True)
        with open(os.path.join(subdir, "test3.txt"), "w") as f:
            f.write("Sample content 3")

        return cls(
            directory=temp_dir,
            pattern="*.txt",
            recursive=True,
            metadata=True,
            ignore_dirs=["__pycache__"],
            ignore_files=["*.pyc"]
        )

    def _resolve_search(self):
        """Split ``self.directory`` into ``(directory, pattern, recursive)`` for the scan."""
        import os

        recursive = self.recursive

        # No wildcard: the directory and pattern are used exactly as given.
        if '*' not in self.directory:
            return self.directory, self.pattern, recursive

        # "src/**/*.py" style: everything from the first "**" onward is the pattern.
        if "**" in self.directory:
            head, _, tail = self.directory.partition("**")
            directory = head.rstrip("/\\")
            if not directory:
                # An empty head means a bare "**..." pattern (search the cwd); a head
                # made only of separators means the filesystem root.
                directory = head if head else os.getcwd()
            # "**" patterns are always searched recursively.
            return directory, f"**{tail}", True

        # "*.txt" style: a bare wildcard is resolved against the current directory.
        if os.path.dirname(self.directory) == "":
            return os.getcwd(), self.directory, recursive

        # "/path/to/dir/*.py" style: the last path component is the pattern.
        return os.path.dirname(self.directory), os.path.basename(self.directory), recursive

    def _is_in_ignored_dir(self, file_path: str, root: str) -> bool:
        """Return True if any directory between ``root`` and the file matches ``ignore_dirs``."""
        import os

        if not self.ignore_dirs:
            return False
        relative_dir = os.path.dirname(os.path.relpath(file_path, root))
        components = [part for part in relative_dir.split(os.sep) if part]
        return any(
            fnmatch.fnmatch(component, ignore_pattern)
            for component in components
            for ignore_pattern in self.ignore_dirs
        )

    def _scan_with_glob(self, directory: str, pattern: str):
        """Build a ScenarioList from the files matching a ``**`` pattern under ``directory``."""
        import os
        import glob

        from .scenario_list import ScenarioList
        from .file_store import FileStore

        full_pattern = os.path.join(directory, pattern)
        # A set removes duplicate matches; sorting keeps the result order deterministic.
        file_paths = sorted(set(glob.glob(full_pattern, recursive=True)))

        scenarios = []
        for file_path in file_paths:
            if not os.path.isfile(file_path):
                continue

            file_name = os.path.basename(file_path)
            if any(fnmatch.fnmatch(file_name, ignore_pattern) for ignore_pattern in self.ignore_files or []):
                continue
            # Honor ignore_dirs here too; each directory component below the search
            # root is matched against the patterns with fnmatch.
            if self._is_in_ignored_dir(file_path, directory):
                continue

            scenario_data = {"file": FileStore(file_path)}

            if self.metadata:
                file_stat = os.stat(file_path)
                scenario_data.update({
                    "file_path": file_path,
                    "file_name": file_name,
                    "file_size": file_stat.st_size,
                    "file_created": file_stat.st_ctime,
                    "file_modified": file_stat.st_mtime,
                })

            scenarios.append(Scenario(scenario_data))

        return ScenarioList(scenarios)

    def to_scenario_list(self):
        """
        Create a ScenarioList from files in a directory.

        Patterns containing ``**`` are expanded with ``glob``; all other
        patterns are delegated to ``DirectoryScanner.scan_directory``.

        Raises:
            FileNotFoundScenarioError: If the resolved directory does not exist.
        """
        import os

        directory, pattern, recursive = self._resolve_search()

        # Check if directory exists
        if not os.path.isdir(directory):
            from .exceptions import FileNotFoundScenarioError
            raise FileNotFoundScenarioError(f"Directory not found: {directory}")

        # Use glob directly for ** patterns to prevent duplicates
        if "**" in pattern:
            return self._scan_with_glob(directory, pattern)

        # Use the standard scanning method for non-** patterns
        return DirectoryScanner.scan_directory(
            directory=directory,
            pattern=pattern,
            recursive=recursive,
            metadata=self.metadata,
            ignore_dirs=self.ignore_dirs,
            ignore_files=self.ignore_files,
        )
347
+
348
+
349
class TuplesSource(Source):
    """Source that pairs a fixed list of field names with rows of values."""

    source_type = "list_of_tuples"

    def __init__(self, field_names: list[str], values: list[tuple], use_indexes: bool = False):
        """
        Initialize a TuplesSource.

        Args:
            field_names: Scenario keys, one per position in each row.
            values: Rows of values; every row must be a tuple or a list.
            use_indexes: If True, also store each row's position under "idx".

        Raises:
            ScenarioError: If any entry of ``values`` is not a tuple or list.
        """
        self.field_names = field_names
        self.values = values
        self.use_indexes = use_indexes

        # Reject the input as soon as one row has the wrong container type.
        for candidate in values:
            if not isinstance(candidate, (tuple, list)):
                raise ScenarioError("All values must be tuples or lists")

    @classmethod
    def example(cls) -> 'TuplesSource':
        """Return an example TuplesSource instance."""
        people = [
            ("Alice", 30, "New York"),
            ("Bob", 25, "San Francisco"),
            ("Charlie", 35, "Boston")
        ]
        return cls(field_names=["name", "age", "city"], values=people, use_indexes=True)

    def to_scenario_list(self):
        """
        Create a ScenarioList from a list of tuples with specified field names.

        Raises:
            ScenarioError: If a row's length differs from the number of field names.
        """
        from .scenario_list import ScenarioList

        built = []
        for position, row in enumerate(self.values):
            width, expected = len(row), len(self.field_names)
            if width != expected:
                raise ScenarioError(
                    f"Tuple {position} has {width} elements, but {expected} field names were provided."
                )

            record = dict(zip(self.field_names, row))
            if self.use_indexes:
                record["idx"] = position
            built.append(Scenario(record))

        return ScenarioList(built)
392
+
393
+
394
class SQLiteSource(Source):
    """Source that reads the rows of one table from a SQLite database file."""

    source_type = "sqlite"

    def __init__(self, db_path: str, table: str, fields: Optional[list] = None):
        """
        Initialize a SQLiteSource.

        Args:
            db_path: Path to the SQLite database file.
            table: Name of the table to read.
            fields: Columns to select; ``None`` selects every column of the table.
        """
        self.db_path = db_path
        self.table = table
        self.fields = fields

    @classmethod
    def example(cls) -> 'SQLiteSource':
        """Return an example SQLiteSource instance backed by a temporary database."""
        import sqlite3
        import tempfile
        import os

        # Create a temporary SQLite database for the example
        fd, temp_path = tempfile.mkstemp(suffix='.db', prefix='edsl_test_')
        os.close(fd)  # Only the path is needed; sqlite3 opens the file itself

        sample_data = [
            (1, 'Alpha', 100),
            (2, 'Beta', 200),
            (3, 'Gamma', 300)
        ]

        conn = sqlite3.connect(temp_path)
        try:
            cursor = conn.cursor()
            cursor.execute('CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)')
            cursor.executemany('INSERT INTO test_table VALUES (?, ?, ?)', sample_data)
            conn.commit()
        finally:
            # Close the connection even if table creation or the inserts fail.
            conn.close()

        return cls(
            db_path=temp_path,
            table='test_table',
            fields=['id', 'name', 'value']
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from a SQLite database.

        Returns:
            A ScenarioList with one Scenario per table row, keyed by column name.
        """
        from .scenario_list import ScenarioList
        import sqlite3

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # If fields weren't provided, get all fields from the table
            fields = self.fields
            if fields is None:
                # NOTE(review): table and field names are interpolated into the SQL
                # text as-is; they must come from a trusted caller.
                cursor.execute(f"PRAGMA table_info({self.table})")
                # Column 1 of each table_info row is the column name.
                fields = [row[1] for row in cursor.fetchall()]

            # Query the data
            field_placeholders = ", ".join(fields)
            cursor.execute(f"SELECT {field_placeholders} FROM {self.table}")
            rows = cursor.fetchall()
        finally:
            # Release the connection even when a query raises.
            conn.close()

        return ScenarioList([Scenario(dict(zip(fields, row))) for row in rows])
464
+
465
+
466
class LaTeXSource(Source):
    """Source that extracts one ``tabular`` environment from a LaTeX file."""

    source_type = "latex"

    def __init__(self, file_path: str, table_index: int = 0, has_header: bool = True):
        """
        Initialize a LaTeXSource with a LaTeX file path.

        Args:
            file_path: The path to the LaTeX file.
            table_index: The index of the table to extract (if multiple tables exist).
                Default is 0 (first table).
            has_header: Whether the table has a header row. Default is True.
        """
        self.file_path = file_path
        self.table_index = table_index
        self.has_header = has_header

    @classmethod
    def example(cls) -> 'LaTeXSource':
        """Return an example LaTeXSource instance backed by a temporary .tex file."""
        import tempfile
        import os

        # Create a temporary LaTeX file with a sample table
        fd, temp_path = tempfile.mkstemp(suffix='.tex', prefix='edsl_test_')
        os.close(fd)  # Close the file descriptor

        # Write a sample LaTeX table to the file
        sample_latex = r"""
\documentclass{article}
\begin{document}
This is a sample document with a table:

\begin{tabular}{lrr}
\textbf{Name} & \textbf{Age} & \textbf{Score} \\
Alice & 30 & 95 \\
Bob & 25 & 87 \\
Charlie & 35 & 92 \\
\end{tabular}

\end{document}
"""
        with open(temp_path, 'w') as f:
            f.write(sample_latex)

        return cls(
            file_path=temp_path,
            table_index=0,
            has_header=True
        )

    @staticmethod
    def _strip_tabular_preamble(body: str) -> str:
        """
        Remove the arguments that directly follow the opening of a tabular.

        ``body`` is the text captured between the begin and end markers, so it
        starts with an optional ``[pos]`` argument and the ``{column spec}``
        (which may contain nested braces). If no balanced column spec is found
        the text is returned unchanged.
        """
        text = body.lstrip()
        if text.startswith("["):
            closing = text.find("]")
            if closing != -1:
                text = text[closing + 1:].lstrip()
        if not text.startswith("{"):
            return body

        depth = 0
        for position, char in enumerate(text):
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[position + 1:]
        return body

    def to_scenario_list(self):
        """
        Create a ScenarioList from a LaTeX file.

        Returns:
            A ScenarioList with one Scenario per data row whose cell count matches
            the header; rows with a different cell count are skipped.

        Raises:
            ScenarioError: If the file has no tabular at ``table_index``.
        """
        from .scenario_list import ScenarioList
        import re

        with open(self.file_path, "r") as f:
            content = f.read()

        # Find all tabular environments
        tabular_pattern = r"\\begin{tabular}(.*?)\\end{tabular}"
        tables = re.findall(tabular_pattern, content, re.DOTALL)

        if not tables or self.table_index >= len(tables):
            raise ScenarioError(f"No table found at index {self.table_index}")

        # The capture starts with the column spec (e.g. "{lrr}"); drop it so it
        # cannot end up inside the first cell of the first row.
        table_content = self._strip_tabular_preamble(tables[self.table_index])
        # Horizontal-rule commands carry no data and would otherwise pollute cells.
        table_content = re.sub(r"\\(?:hline|toprule|midrule|bottomrule)\b", "", table_content)

        # Rows are separated by a LaTeX line break (two backslashes)
        rows = [row.strip() for row in table_content.split("\\\\") if row.strip()]

        if not rows:
            return ScenarioList()

        if self.has_header:
            # Unwrap \textbf{...} cell by cell so that a header mixing bold and
            # plain cells still yields one name per column.
            header_cells = [
                re.sub(r"\\textbf\{(.*?)\}", r"\1", cell).strip()
                for cell in rows[0].split("&")
            ]
            data_rows = rows[1:]
        else:
            # Auto-generate column names
            header_cells = [f"col{i}" for i in range(rows[0].count("&") + 1)]
            data_rows = rows

        scenarios = []
        for row in data_rows:
            cells = [cell.strip() for cell in row.split("&")]

            if len(cells) != len(header_cells):
                continue  # Skip malformed rows

            scenarios.append(Scenario(dict(zip(header_cells, cells))))

        return ScenarioList(scenarios)
568
+
569
+
570
class GoogleDocSource(Source):
    """Source that downloads a Google Doc as .docx and yields one Scenario per paragraph."""

    source_type = "google_doc"

    def __init__(self, url: str):
        """
        Initialize a GoogleDocSource with a Google Doc URL.

        Args:
            url: The URL to the Google Doc.
        """
        self.url = url

    @classmethod
    def example(cls) -> 'GoogleDocSource':
        """Return an example GoogleDocSource instance that performs no network access."""
        # Create a mock instance that doesn't actually fetch a Google Doc
        instance = cls(url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit")

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Create a simple mock ScenarioList with a few paragraphs
            scenarios = [
                Scenario({"text": "This is paragraph 1 from a sample Google Doc."}),
                Scenario({"text": "This is paragraph 2 with some more content."}),
                Scenario({"text": "This is the final paragraph with a conclusion."})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """
        Create a ScenarioList from a Google Doc.

        Returns:
            A ScenarioList with one Scenario (key "text") per paragraph.

        Raises:
            ScenarioError: If the URL lacks "/d/" or "/edit", the download fails,
                or the downloaded document cannot be processed.
        """
        from .scenario_list import ScenarioList
        import os
        import tempfile
        import requests

        # Extract the document ID from the URL
        if "/edit" not in self.url or "/d/" not in self.url:
            raise ScenarioError("Invalid Google Doc URL format.")
        doc_id = self.url.split("/d/")[1].split("/edit")[0]

        # Create the export URL to download as DOCX
        export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"

        temp_filename = None
        try:
            # Download the Google Doc as a Word file (.docx)
            response = requests.get(export_url)
            response.raise_for_status()  # Ensure the request was successful

            # Save the Word file to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                # Record the name first so the file is cleaned up even if the write fails.
                temp_filename = temp_file.name
                temp_file.write(response.content)

            # Use the DocxScenario class to process the temporary file
            from .DocxScenario import DocxScenario

            docx_scenario = DocxScenario(temp_filename)
            scenarios = [Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs]

            return ScenarioList(scenarios)

        except requests.RequestException as e:
            raise ScenarioError(f"Failed to fetch Google Doc: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"Error processing Google Doc: {str(e)}") from e
        finally:
            # delete=False leaves the file on disk, so remove it once it has been read.
            if temp_filename is not None:
                try:
                    os.remove(temp_filename)
                except OSError:
                    # Cleanup is best effort; a leftover temp file must not mask the result.
                    pass
644
+
645
+
646
class PandasSource(Source):
    """Source that converts each row of a pandas DataFrame into a Scenario."""

    source_type = "pandas"

    def __init__(self, df):
        """
        Initialize a PandasSource with a pandas DataFrame.

        Args:
            df: A pandas DataFrame.

        Raises:
            ImportError: If pandas cannot be imported.
            ScenarioError: If ``df`` is not a pandas DataFrame.
        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required for PandasSource")

        if not isinstance(df, pd.DataFrame):
            raise ScenarioError("Input must be a pandas DataFrame")
        self.df = df

    @classmethod
    def example(cls) -> 'PandasSource':
        """Return an example PandasSource instance."""
        try:
            import pandas as pd

            # Small in-memory frame used as the example payload
            frame = pd.DataFrame({
                'name': ['Alice', 'Bob', 'Charlie', 'David'],
                'age': [30, 25, 35, 28],
                'city': ['New York', 'San Francisco', 'Boston', 'Seattle']
            })
            return cls(frame)
        except ImportError:
            # pandas is unavailable: fall through to a stand-in that needs no DataFrame
            pass

        import types

        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Same records as the DataFrame above, built directly as Scenarios
            return ScenarioList([
                Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
                Scenario({"name": "David", "age": 28, "city": "Seattle"})
            ])

        # Bypass __init__ (which requires pandas) and patch this one instance only
        stand_in = cls.__new__(cls)
        stand_in.to_scenario_list = types.MethodType(mock_to_scenario_list, stand_in)
        return stand_in

    def to_scenario_list(self):
        """Create a ScenarioList from a pandas DataFrame."""
        from .scenario_list import ScenarioList

        # One Scenario per DataFrame row, keyed by column name
        return ScenarioList([Scenario(row.to_dict()) for _, row in self.df.iterrows()])
712
+
713
+
714
class StataSource(Source):
    """Source that reads a Stata (.dta) file, optionally keeping its labels."""

    source_type = "dta"

    def __init__(self, file_path: str, include_metadata: bool = True):
        """
        Initialize a StataSource with a path to a Stata data file.

        Args:
            file_path: Path to the Stata (.dta) file.
            include_metadata: If True, extract and preserve variable labels and value labels
                as additional metadata in the ScenarioList.
        """
        self.file_path = file_path
        self.include_metadata = include_metadata

    @classmethod
    def example(cls) -> 'StataSource':
        """Return an example StataSource instance that reads no file."""
        # Since we can't easily create a real Stata file for testing,
        # we'll create a mock instance with an override
        instance = cls(file_path="/path/to/nonexistent/file.dta")

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with Stata-like data
            scenarios = [
                Scenario({"id": 1, "gender": 1, "income": 50000, "education": 2}),
                Scenario({"id": 2, "gender": 2, "income": 45000, "education": 3}),
                Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4})
            ]

            result = ScenarioList(scenarios)

            # Add metadata similar to what would be in a Stata file
            if self.include_metadata:
                result.codebook = {
                    "variable_labels": {
                        "gender": "Gender (1=Male, 2=Female)",
                        "income": "Annual income in USD",
                        "education": "Education level (1-4)"
                    },
                    "value_labels": {
                        "gender": {1: "Male", 2: "Female"},
                        "education": {1: "High School", 2: "Associate", 3: "Bachelor", 4: "Graduate"}
                    }
                }

            return result

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """
        Create a ScenarioList from a Stata data file.

        Returns:
            A ScenarioList with one Scenario per row. When ``include_metadata`` is
            True and the file carries labels, they are stored on the result's
            ``codebook`` under "variable_labels" and "value_labels".

        Raises:
            ImportError: If pandas cannot be imported.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required to read Stata files")

        variable_labels = {}
        value_labels = {}

        # The labels live on the StataReader, not on the DataFrame it returns, so
        # open the file through the reader (iterator=True) and query it directly.
        with pd.read_stata(self.file_path, iterator=True) as reader:
            df = reader.read()
            if self.include_metadata:
                # Keep only variables that actually have a label.
                variable_labels = {
                    name: label
                    for name, label in reader.variable_labels().items()
                    if label
                }
                value_labels = reader.value_labels() or {}

        # Create scenarios
        scenarios = [Scenario(row.to_dict()) for _, row in df.iterrows()]

        # Create the basic ScenarioList
        result = ScenarioList(scenarios)

        # Store the metadata in the ScenarioList's codebook
        if variable_labels or value_labels:
            result.codebook = {
                "variable_labels": variable_labels,
                "value_labels": value_labels,
            }

        return result
815
+
816
+
817
class WikipediaSource(Source):
    """Source that turns one HTML table of a Wikipedia page into a ScenarioList."""

    source_type = "wikipedia"

    def __init__(self, url: str, table_index: int = 0, header: bool = True):
        """
        Initialize a WikipediaSource with a URL to a Wikipedia page.

        Args:
            url: The URL of the Wikipedia page.
            table_index: The index of the table to extract (default is 0).
            header: Whether the table has a header row (default is True).
        """
        self.url = url
        self.table_index = table_index
        self.header = header

    @classmethod
    def example(cls) -> 'WikipediaSource':
        """Return an example WikipediaSource instance.

        The instance points at a real Wikipedia URL, but its
        ``to_scenario_list`` is replaced (on this instance only) by a stub
        that returns five hard-coded GDP rows, so calling it needs no network.
        """
        instance = cls(
            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
            table_index=0,
            header=True
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with GDP data
            scenarios = [
                Scenario({"Rank": 1, "Country": "United States", "GDP (millions of USD)": 25460000}),
                Scenario({"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}),
                Scenario({"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}),
                Scenario({"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}),
                Scenario({"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000})
            ]

            return ScenarioList(scenarios)

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a table on a Wikipedia page.

        The page is downloaded once with ``requests`` and the response body
        is handed to ``pandas.read_html``; the table at ``self.table_index``
        is converted to one Scenario per row.

        Returns:
            A ScenarioList with one Scenario per table row.

        Raises:
            ImportError: If pandas is not installed.
            ScenarioError: If the page cannot be fetched, no table can be
                parsed, the table index is out of range, or any other error
                occurs while building the scenarios.
        """
        from io import StringIO

        from .scenario_list import ScenarioList
        import requests

        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError("pandas is required to read Wikipedia tables") from exc

        try:
            # Fetch the page; raise_for_status turns HTTP errors into exceptions
            response = requests.get(self.url)
            response.raise_for_status()

            # Parse the body we already downloaded instead of letting pandas
            # fetch the same URL a second time.
            tables = pd.read_html(
                StringIO(response.text), header=0 if self.header else None
            )

            # Ensure the requested table index is within the range of available tables
            if not 0 <= self.table_index < len(tables):
                raise ScenarioError(
                    f"Table index {self.table_index} is out of range. This page has {len(tables)} table(s)."
                )

            df = tables[self.table_index]
            return ScenarioList(
                [Scenario(row.to_dict()) for _, row in df.iterrows()]
            )

        except ScenarioError:
            # Already a domain error with a precise message; do not re-wrap it
            # as "unexpected".
            raise
        except requests.exceptions.RequestException as e:
            raise ScenarioError(f"Error fetching the URL: {str(e)}") from e
        except ValueError as e:
            raise ScenarioError(f"Error parsing tables: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"An unexpected error occurred: {str(e)}") from e
906
+
907
+
908
class ExcelSource(Source):
    """Source that loads one sheet of an Excel workbook into a ScenarioList."""

    source_type = "excel"

    def __init__(
        self,
        file_path: str,
        sheet_name: Optional[str] = None,
        skip_rows: Optional[List[int]] = None,
        use_codebook: bool = False,
        **kwargs
    ):
        """
        Initialize an ExcelSource with a path to an Excel file.

        Args:
            file_path: Path to the Excel file.
            sheet_name: Name of the sheet to load. If None and multiple sheets exist,
                will raise an error listing available sheets.
            skip_rows: List of row indices to skip (0-based). If None, all rows are included.
            use_codebook: If True, rename columns to standard format and store original names in codebook.
            **kwargs: Additional parameters to pass to pandas.read_excel.
        """
        self.file_path = file_path
        self.sheet_name = sheet_name
        self.skip_rows = skip_rows
        self.use_codebook = use_codebook
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'ExcelSource':
        """Return an example ExcelSource instance.

        When pandas can write the file, a temporary two-sheet workbook is
        created and the instance targets ``'Sheet1'``. If an ImportError is
        raised along the way, a mock instance is returned whose
        ``to_scenario_list`` yields three hard-coded rows.
        """
        import tempfile
        import os

        temp_path = None
        try:
            import pandas as pd

            # Create a temporary Excel file with sample data
            fd, temp_path = tempfile.mkstemp(suffix='.xlsx', prefix='edsl_test_')
            os.close(fd)  # Close the file descriptor

            # Create sample data
            df1 = pd.DataFrame({
                'name': ['Alice', 'Bob', 'Charlie'],
                'age': [30, 25, 35],
                'city': ['New York', 'San Francisco', 'Boston']
            })

            df2 = pd.DataFrame({
                'name': ['David', 'Eve'],
                'age': [40, 45],
                'city': ['Seattle', 'Chicago']
            })

            # Write to Excel file with multiple sheets
            with pd.ExcelWriter(temp_path) as writer:
                df1.to_excel(writer, sheet_name='Sheet1', index=False)
                df2.to_excel(writer, sheet_name='Sheet2', index=False)

            return cls(
                file_path=temp_path,
                sheet_name='Sheet1'
            )

        except ImportError:
            # An ImportError raised after mkstemp() would otherwise leave an
            # empty temp file behind; the mock below does not use it.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)

            # Create a mock instance with an override
            instance = cls(file_path="/path/to/nonexistent/file.xlsx")

            # Override the to_scenario_list method just for the example
            def mock_to_scenario_list(self):
                from .scenario_list import ScenarioList
                # Create a simple mock ScenarioList with sample data
                scenarios = [
                    Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                    Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                    Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
                ]
                return ScenarioList(scenarios)

            # Replace the method on this instance only
            import types
            instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

            return instance

    def to_scenario_list(self):
        """Create a ScenarioList from an Excel file.

        Returns:
            A ScenarioList with one Scenario per remaining row. When
            ``use_codebook`` is true the keys are ``col_0``, ``col_1``, ...
            and ``result.codebook`` maps those keys back to the original
            column names.

        Raises:
            ImportError: If pandas is not installed.
            ScenarioError: If no ``sheet_name`` was given and the workbook
                has more than one sheet.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError("pandas is required to read Excel files") from exc

        sheet_name = self.sheet_name
        if sheet_name is None:
            # Only the sheet names are needed to pick a default, so list them
            # without parsing the data of every sheet in the workbook.
            with pd.ExcelFile(self.file_path) as workbook:
                sheet_names = list(workbook.sheet_names)

            if len(sheet_names) > 1:
                available_sheets = ", ".join([f"'{name}'" for name in sheet_names])
                raise ScenarioError(
                    f"The Excel file contains multiple sheets: {available_sheets}. "
                    "Please provide a sheet_name parameter."
                )
            # If there is only one sheet, use it
            sheet_name = sheet_names[0]

        # Load the specified or determined sheet
        df = pd.read_excel(self.file_path, sheet_name=sheet_name, **self.kwargs)

        # Skip specified rows if any, then restore a continuous index
        if self.skip_rows:
            df = df.drop(self.skip_rows).reset_index(drop=True)

        if not self.use_codebook:
            # Create scenarios with original column names
            return ScenarioList(
                [Scenario(row.to_dict()) for _, row in df.iterrows()]
            )

        # codebook: generated key -> original column name
        # key_for_column: original column name -> generated key
        codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
        key_for_column = {col: f"col_{i}" for i, col in enumerate(df.columns)}

        # Create scenarios with renamed columns
        scenarios = [
            Scenario({key_for_column.get(k): v for k, v in row.to_dict().items()})
            for _, row in df.iterrows()
        ]

        result = ScenarioList(scenarios)
        result.codebook = codebook
        return result
1050
+
1051
+
1052
class GoogleSheetSource(Source):
    """Source that downloads a Google Sheet as XLSX and loads it via ExcelSource."""

    source_type = "google_sheet"

    def __init__(
        self,
        url: str,
        sheet_name: Optional[str] = None,
        column_names: Optional[List[str]] = None,
        **kwargs
    ):
        """
        Initialize a GoogleSheetSource with a URL to a Google Sheet.

        Args:
            url: The URL of the Google Sheet.
            sheet_name: The name of the sheet to load. It is forwarded to
                ExcelSource, which requires it when the workbook has more
                than one sheet.
            column_names: If provided, use these names for the columns instead
                of the default column names from the sheet.
            **kwargs: Additional parameters forwarded to ExcelSource.
        """
        self.url = url
        self.sheet_name = sheet_name
        self.column_names = column_names
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'GoogleSheetSource':
        """Return an example GoogleSheetSource instance.

        The URL is a placeholder; ``to_scenario_list`` is replaced on this
        instance only by a stub returning three hard-coded rows.
        """
        # Use a mock instance since we can't create a real Google Sheet for testing
        instance = cls(
            url="https://docs.google.com/spreadsheets/d/1234567890abcdefg/edit",
            sheet_name="Sheet1"
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with sample data
            scenarios = [
                Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a Google Sheet.

        The sheet is exported as XLSX, written to a temporary file, loaded
        through ExcelSource, and the temporary file is removed afterwards.
        If ``column_names`` was given and the sheet is non-empty, the keys
        of every scenario are renamed positionally.

        Returns:
            A ScenarioList with one Scenario per sheet row.

        Raises:
            ScenarioError: If the URL does not contain both ``/d/`` and
                ``/edit``, the download fails, the number of column names
                does not match, or any other processing error occurs.
        """
        from .scenario_list import ScenarioList
        import os
        import tempfile
        import requests

        # Extract the sheet ID from the URL; both markers are needed for the
        # split below to be well defined.
        if "/d/" not in self.url or "/edit" not in self.url:
            raise ScenarioError("Invalid Google Sheet URL format.")
        sheet_id = self.url.split("/d/")[1].split("/edit")[0]

        # Create the export URL for XLSX format
        export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"

        temp_filename = None
        try:
            # Download the Google Sheet as an Excel file
            response = requests.get(export_url)
            response.raise_for_status()  # Ensure the request was successful

            # Save the Excel file to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
                temp_file.write(response.content)
                temp_filename = temp_file.name

            # Use ExcelSource to create the initial ScenarioList
            excel_source = ExcelSource(
                file_path=temp_filename,
                sheet_name=self.sheet_name,
                **self.kwargs
            )
            scenario_list = excel_source.to_scenario_list()

            # Nothing to rename: return the list as loaded
            if self.column_names is None or not scenario_list:
                return scenario_list

            original_keys = list(scenario_list[0].keys())
            if len(self.column_names) != len(original_keys):
                raise ScenarioError(
                    f"Number of provided column names ({len(self.column_names)}) "
                    f"does not match number of columns in sheet ({len(original_keys)})"
                )

            # Create a mapping from original keys to new names
            column_mapping = dict(zip(original_keys, self.column_names))

            # Create a new ScenarioList with renamed columns
            renamed_scenarios = [
                Scenario({column_mapping.get(k, k): v for k, v in scenario.items()})
                for scenario in scenario_list
            ]
            return ScenarioList(renamed_scenarios)

        except ScenarioError:
            # Already a precise domain error; avoid wrapping it a second time
            raise
        except requests.exceptions.RequestException as e:
            raise ScenarioError(f"Error fetching the Google Sheet: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"Error processing Google Sheet: {str(e)}") from e
        finally:
            # The scenarios are fully built by now, so the downloaded copy is
            # no longer needed. Removal is best-effort.
            if temp_filename is not None:
                try:
                    os.remove(temp_filename)
                except OSError:
                    pass
1163
+
1164
+
1165
class DelimitedFileSource(Source):
    """Source for delimiter-separated text, read from a local path or an http(s) URL."""

    source_type = "delimited_file"

    def __init__(
        self,
        file_or_url: str,
        delimiter: str = ",",
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Store the location and parsing options of a delimited file.

        Args:
            file_or_url: Local file path, or an http/https URL of a remote file.
            delimiter: Field separator used in the file (',' by default).
            has_header: True when the first row holds the column names.
            encoding: Encoding used to open a local file ('utf-8' by default).
            **kwargs: Extra keyword arguments forwarded to ``csv.reader``.
        """
        self.file_or_url = file_or_url
        self.delimiter = delimiter
        self.has_header = has_header
        self.encoding = encoding
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'DelimitedFileSource':
        """Build an instance backed by a freshly written temporary CSV file."""
        import tempfile
        import os

        # mkstemp hands back an open descriptor; only the path is needed here
        descriptor, sample_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
        os.close(descriptor)

        sample_lines = (
            "name,age,city\n",
            "Alice,30,New York\n",
            "Bob,25,San Francisco\n",
            "Charlie,35,Boston\n",
        )
        with open(sample_path, 'w', newline='') as handle:
            for line in sample_lines:
                handle.write(line)

        return cls(file_or_url=sample_path, delimiter=",", has_header=True)

    def to_scenario_list(self):
        """Parse the delimited content into a ScenarioList.

        An http/https location is fetched with ``requests``; anything else is
        opened as a local file with ``self.encoding``, retrying with a short
        list of fallback encodings on a decode failure. Rows whose length
        differs from the header are skipped with a warning.

        Raises:
            ScenarioError: When the URL cannot be fetched, or the file cannot
                be read or decoded.
        """
        from .scenario_list import ScenarioList
        import requests

        location = self.file_or_url
        if urlparse(location).scheme in ("http", "https"):
            request_headers = {
                "Accept": "text/csv,application/csv,text/plain",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
            try:
                reply = requests.get(location, headers=request_headers)
                reply.raise_for_status()
                content = reply.text
            except requests.RequestException as e:
                raise ScenarioError(f"Failed to fetch URL: {str(e)}")
        else:
            # Anything without an http(s) scheme is treated as a file path
            try:
                with open(location, "r", encoding=self.encoding) as handle:
                    content = handle.read()
            except UnicodeDecodeError:
                # Retry with the fallbacks, leaving out the one already tried
                fallbacks = [
                    name
                    for name in ("latin-1", "cp1252", "ISO-8859-1")
                    if name != self.encoding
                ]
                for candidate in fallbacks:
                    try:
                        with open(location, "r", encoding=candidate) as handle:
                            content = handle.read()
                    except UnicodeDecodeError:
                        continue
                    break
                else:
                    raise ScenarioError("Failed to decode file with any of the attempted encodings")
            except Exception as e:
                raise ScenarioError(f"Failed to read file: {str(e)}")

        # Parse the text
        rows = list(csv.reader(StringIO(content), delimiter=self.delimiter, **self.kwargs))
        if not rows:
            return ScenarioList()

        if self.has_header:
            header, data_rows = rows[0], rows[1:]
        else:
            # No header row: synthesize col0, col1, ... from the first row's width
            header = [f"col{i}" for i in range(len(rows[0]))]
            data_rows = rows

        scenarios = []
        for row in data_rows:
            if len(row) == len(header):
                scenarios.append(Scenario(dict(zip(header, row))))
            else:
                warnings.warn(f"Skipping row with {len(row)} values (expected {len(header)})")

        return ScenarioList(scenarios)
1283
+
1284
+
1285
class CSVSource(DelimitedFileSource):
    """DelimitedFileSource preset with a comma delimiter."""

    source_type = "csv"

    def __init__(
        self,
        file_or_url: str,
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Set up a CSV source; the delimiter is fixed to ','.

        Args:
            file_or_url: Local file path, or URL of a remote file.
            has_header: True when the first row holds the column names.
            encoding: File encoding to use ('utf-8' by default).
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        super().__init__(
            file_or_url=file_or_url,
            delimiter=",",
            has_header=has_header,
            encoding=encoding,
            **kwargs
        )

    @classmethod
    def example(cls) -> 'CSVSource':
        """Build an instance backed by a freshly written temporary CSV file."""
        import tempfile
        import os

        # mkstemp hands back an open descriptor; only the path is needed here
        descriptor, sample_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
        os.close(descriptor)

        sample_lines = (
            "name,age,city\n",
            "Alice,30,New York\n",
            "Bob,25,San Francisco\n",
            "Charlie,35,Boston\n",
        )
        with open(sample_path, 'w', newline='') as handle:
            for line in sample_lines:
                handle.write(line)

        return cls(file_or_url=sample_path, has_header=True)
1333
+
1334
+
1335
class TSVSource(DelimitedFileSource):
    """DelimitedFileSource preset with a tab delimiter."""

    source_type = "tsv"

    def __init__(
        self,
        file_or_url: str,
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Set up a TSV source; the delimiter is fixed to a tab character.

        Args:
            file_or_url: Local file path, or URL of a remote file.
            has_header: True when the first row holds the column names.
            encoding: File encoding to use ('utf-8' by default).
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        super().__init__(
            file_or_url=file_or_url,
            delimiter="\t",
            has_header=has_header,
            encoding=encoding,
            **kwargs
        )

    @classmethod
    def example(cls) -> 'TSVSource':
        """Build an instance backed by a freshly written temporary TSV file."""
        import tempfile
        import os

        # mkstemp hands back an open descriptor; only the path is needed here
        descriptor, sample_path = tempfile.mkstemp(suffix='.tsv', prefix='edsl_test_')
        os.close(descriptor)

        sample_lines = (
            "name\tage\tcity\n",
            "Alice\t30\tNew York\n",
            "Bob\t25\tSan Francisco\n",
            "Charlie\t35\tBoston\n",
        )
        with open(sample_path, 'w', newline='') as handle:
            for line in sample_lines:
                handle.write(line)

        return cls(file_or_url=sample_path, has_header=True)
1383
+
1384
class ParquetSource(Source):
    """Source that loads the rows of a Parquet file into a ScenarioList."""

    source_type = "parquet"

    def __init__(self, file_path: str):
        """
        Remember where the Parquet file lives.

        Args:
            file_path: Path to the Parquet file.
        """
        self.file_path = file_path

    @classmethod
    def example(cls) -> 'ParquetSource':
        """Build an example instance.

        With pandas and pyarrow importable, a three-row temporary Parquet
        file is written and used. On any ImportError, a mock instance is
        returned whose ``to_scenario_list`` yields the same three rows from
        hard-coded data.
        """
        import tempfile
        import os

        try:
            import pandas as pd
            # Imported only to trigger the ImportError fallback when missing
            import pyarrow as pa
            import pyarrow.parquet as pq

            descriptor, sample_path = tempfile.mkstemp(suffix='.parquet', prefix='edsl_test_')
            os.close(descriptor)  # only the path is needed

            sample_frame = pd.DataFrame({
                'name': ['Alice', 'Bob', 'Charlie'],
                'age': [30, 25, 35],
                'city': ['New York', 'San Francisco', 'Boston']
            })
            sample_frame.to_parquet(sample_path)

            return cls(file_path=sample_path)

        except ImportError:
            import types

            def mock_to_scenario_list(self):
                from .scenario_list import ScenarioList
                # Hard-coded stand-in for the file contents
                return ScenarioList([
                    Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                    Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                    Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
                ])

            # Bind the stub to this one instance; the class is left untouched
            stub = cls(file_path="/path/to/nonexistent/file.parquet")
            stub.to_scenario_list = types.MethodType(mock_to_scenario_list, stub)
            return stub

    def to_scenario_list(self):
        """Read the Parquet file and return one Scenario per row.

        Raises:
            ImportError: When pandas or pyarrow is not installed.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required to read Parquet files")

        try:
            import pyarrow
        except ImportError:
            raise ImportError("pyarrow is required to read Parquet files")

        frame = pd.read_parquet(self.file_path)
        return ScenarioList([Scenario(record.to_dict()) for _, record in frame.iterrows()])
1468
+
1469
+
1470
class PDFSource(Source):
    """Source that extracts text from a PDF (local path or URL) into a ScenarioList."""

    source_type = "pdf"

    def __init__(
        self,
        file_path: str,
        chunk_type: Literal["page", "text"] = "page",
        chunk_size: int = 1,
        chunk_overlap: int = 0
    ):
        """
        Initialize a PDFSource with a path to a PDF file.

        Args:
            file_path: Path to the PDF file or URL to a PDF.
            chunk_type: Type of chunking to use ("page" or "text").
            chunk_size: Size of chunks to create. Stored on the instance;
                ``to_scenario_list`` does not currently read it.
            chunk_overlap: Number of overlapping chunks. Stored on the
                instance; ``to_scenario_list`` does not currently read it.
        """
        self.file_path = file_path
        self.chunk_type = chunk_type
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @classmethod
    def example(cls) -> 'PDFSource':
        """Return an example PDFSource instance.

        The path does not exist; ``to_scenario_list`` is replaced on this
        instance only by a stub returning two hard-coded pages.
        """
        # Skip actual file creation and just use a mock instance
        instance = cls(
            file_path="/path/to/nonexistent/file.pdf",
            chunk_type="page",
            chunk_size=1,
            chunk_overlap=0
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Create a simple mock ScenarioList with sample PDF data
            scenarios = [
                Scenario({"filename": "example.pdf", "page": 1, "text": "This is page 1 content"}),
                Scenario({"filename": "example.pdf", "page": 2, "text": "This is page 2 content"})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a PDF file.

        With ``chunk_type == "page"`` the scenarios produced by
        ``PdfTools.extract_text_from_pdf`` are returned as they are. With
        ``chunk_type == "text"`` the ``"text"`` of all of them is joined
        into a copy of the first one, giving a single scenario. An
        extraction that yields nothing gives an empty ScenarioList in both
        modes.

        Returns:
            A ScenarioList built from the extracted PDF text.

        Raises:
            ScenarioError: If ``chunk_type`` is invalid or anything fails
                while downloading or extracting the PDF.
        """
        from .scenario_list import ScenarioList
        from .scenario_list_pdf_tools import PdfTools
        from .exceptions import ScenarioError

        # Reject a bad chunk_type before doing any download or extraction.
        # The message matches what callers previously saw after wrapping.
        if self.chunk_type not in ("page", "text"):
            raise ScenarioError(
                f"Error processing PDF: Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'."
            )

        try:
            # Check if it's a URL
            if PdfTools.is_url(self.file_path):
                # Download the PDF file
                if "drive.google.com" in self.file_path:
                    # It's a Google Drive URL
                    local_path = PdfTools.GoogleDriveDownloader.fetch_from_drive(
                        self.file_path, "temp_pdf.pdf"
                    )
                else:
                    # It's a regular URL
                    local_path = PdfTools.fetch_and_save_pdf(self.file_path, "temp_pdf.pdf")
            else:
                # It's a local file path
                local_path = self.file_path

            # Extract scenarios from the PDF
            scenarios = list(PdfTools.extract_text_from_pdf(local_path))

            # Page mode returns the extraction unchanged; an empty extraction
            # has nothing to merge, so it is returned unchanged as well
            # rather than failing on scenarios[0].
            if self.chunk_type == "page" or not scenarios:
                return ScenarioList(scenarios)

            # Text mode: one scenario based on the first, carrying all text
            base_scenario = scenarios[0].copy()
            base_scenario["text"] = "".join(scenario["text"] for scenario in scenarios)
            return ScenarioList([base_scenario])

        except Exception as e:
            raise ScenarioError(f"Error processing PDF: {str(e)}") from e
1565
+
1566
+
1567
class PDFImageSource(Source):
    """Source that renders the pages of a PDF as images via PdfTools."""

    source_type = "pdf_to_image"

    def __init__(
        self,
        file_path: str,
        base_width: int = 2000,
        include_text: bool = True
    ):
        """
        Store the PDF location and image options.

        Args:
            file_path: Path to the PDF file.
            base_width: Width to use for the generated images.
            include_text: Whether to include extracted text with the images.

        Note:
            ``base_width`` and ``include_text`` are kept on the instance but
            are not forwarded by ``to_scenario_list`` in this class.
        """
        self.file_path = file_path
        self.base_width = base_width
        self.include_text = include_text

    @classmethod
    def example(cls) -> 'PDFImageSource':
        """Build a mock instance whose to_scenario_list returns two canned pages."""
        import types

        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Hard-coded stand-in for two converted pages
            return ScenarioList([
                Scenario({"filepath": "/tmp/page_1.jpeg", "page": 0, "text": "This is page 1 content"}),
                Scenario({"filepath": "/tmp/page_2.jpeg", "page": 1, "text": "This is page 2 content"})
            ])

        # No file is created; the path is a placeholder
        stub = cls(
            file_path="/path/to/nonexistent/file.pdf",
            base_width=2000,
            include_text=True
        )
        # Bind the stub to this one instance; the class is left untouched
        stub.to_scenario_list = types.MethodType(mock_to_scenario_list, stub)
        return stub

    def to_scenario_list(self):
        """Convert the PDF's pages to JPEG images and wrap them in a ScenarioList.

        Raises:
            ScenarioError: On any failure, including pdf2image not being
                installed (the ImportError is wrapped).
        """
        from .scenario_list import ScenarioList
        from .scenario_list_pdf_tools import PdfTools

        try:
            # pdf2image is imported only to confirm it is available
            try:
                from pdf2image import convert_from_path
            except ImportError:
                raise ImportError("pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'.")

            page_images = PdfTools.from_pdf_to_image(self.file_path, image_format="jpeg")
            return ScenarioList(page_images)

        except Exception as err:
            from .exceptions import ScenarioError
            raise ScenarioError(f"Error converting PDF to images: {str(err)}")
1633
+
1634
+
1635
+ class ScenarioSource:
1636
+ """
1637
+ Factory class for creating ScenarioList objects from various sources.
1638
+
1639
+ This class provides static methods for creating ScenarioList objects from different
1640
+ data sources, centralizing the creation logic that was previously scattered across
1641
+ different classmethods in the ScenarioList class.
1642
+
1643
+ The main entry point is the from_source method, which dispatches to appropriate
1644
+ source-specific methods based on the source_type parameter.
1645
+ """
1646
+
1647
@staticmethod
def from_source(source_type: str, *args, **kwargs):
    """
    Create a ScenarioList from a specified source type.

    The source class is looked up with ``Source.get_source_class``. If that
    lookup raises ValueError, a legacy ``_from_<source_type>`` static method
    on ScenarioSource is tried for backward compatibility.

    Args:
        source_type: The type of source to create a ScenarioList from.
            Valid values include: 'urls', 'directory', 'list', 'list_of_tuples',
            'sqlite', 'latex', 'google_doc', 'pandas', 'dta', 'wikipedia',
            'excel', 'google_sheet', 'delimited_file', 'csv', 'tsv', 'dict',
            'nested_dict', 'parquet', 'pdf', 'pdf_to_image'.
        *args: Positional arguments to pass to the source class or legacy method.
        **kwargs: Keyword arguments to pass to the source class or legacy method.

    Returns:
        A ScenarioList object created from the specified source.

    Raises:
        ValueError: If the source_type is not recognized. A ValueError
            raised by a recognized source while it is constructed or
            converted propagates unchanged.
    """
    # Guard only the lookup. A ValueError raised later by the source itself
    # (constructor or to_scenario_list) is a real error in that source and
    # must not be reported as "Unsupported source type".
    try:
        source_class = Source.get_source_class(source_type)
    except ValueError as exc:
        # For backward compatibility, try the old method if the source_type
        # isn't in the registry
        legacy_method = getattr(ScenarioSource, f"_from_{source_type}", None)
        if legacy_method is None:
            raise ValueError(f"Unsupported source type: {source_type}") from exc
        return legacy_method(*args, **kwargs)

    return source_class(*args, **kwargs).to_scenario_list()
1682
+
1683
@staticmethod
def _from_urls(urls: list[str], field_name: Optional[str] = "text"):
    """Fetch each URL and collect the response bodies into a ScenarioList.

    Every successful response becomes a Scenario mapping ``field_name`` to
    the response text. A URL that fails to download is reported with a
    warning and left out of the result.
    """
    from .scenario_list import ScenarioList

    import requests

    collected = ScenarioList()
    for address in urls:
        try:
            reply = requests.get(address)
            reply.raise_for_status()
            collected.append(Scenario({field_name: reply.text}))
        except requests.RequestException as err:
            # Best effort: note the failure and move on to the next URL
            warnings.warn(f"Failed to fetch URL {address}: {str(err)}")

    return collected
1702
+
1703
@staticmethod
def _from_directory(
    directory: str,
    pattern: str = "*",
    recursive: bool = False,
    metadata: bool = True,
    ignore_dirs: List[str] = None,
    ignore_files: List[str] = None,
):
    """Deprecated wrapper: build a ScenarioList from files in a directory.

    Emits a DeprecationWarning, then delegates to DirectorySource with the
    same arguments.
    """
    notice = "_from_directory is deprecated. Use DirectorySource directly or ScenarioSource.from_source('directory', ...) instead."
    # stacklevel=2 attributes the warning to the caller of this wrapper
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    return DirectorySource(
        directory=directory,
        pattern=pattern,
        recursive=recursive,
        metadata=metadata,
        ignore_dirs=ignore_dirs,
        ignore_files=ignore_files
    ).to_scenario_list()
1727
+
1728
@staticmethod
def _from_list(
    field_name: str, values: list, use_indexes: bool = False
):
    """Deprecated wrapper: build a ScenarioList from values under one field name.

    Emits a DeprecationWarning, then delegates to ListSource.
    """
    notice = "_from_list is deprecated. Use ListSource directly or ScenarioSource.from_source('list', ...) instead."
    # stacklevel=2 attributes the warning to the caller of this wrapper
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    return ListSource(field_name, values, use_indexes).to_scenario_list()
1740
+
1741
@staticmethod
def _from_list_of_tuples(
    field_names: list[str], values: list[tuple], use_indexes: bool = False
):
    """Deprecated wrapper: build a ScenarioList from tuples and their field names.

    Emits a DeprecationWarning, then delegates to TuplesSource.
    """
    notice = "_from_list_of_tuples is deprecated. Use TuplesSource directly or ScenarioSource.from_source('list_of_tuples', ...) instead."
    # stacklevel=2 attributes the warning to the caller of this wrapper
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    return TuplesSource(field_names, values, use_indexes).to_scenario_list()
1753
+
1754
@staticmethod
def _from_sqlite(db_path: str, table: str, fields: Optional[list] = None):
    """Deprecated shim: build a ScenarioList from a SQLite database.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``SQLiteSource(db_path, table, fields).to_scenario_list()``. The
    arguments are handed to ``SQLiteSource`` positionally; ``fields``
    is ``None`` when omitted.
    """
    deprecation_message = (
        "_from_sqlite is deprecated. Use SQLiteSource directly or ScenarioSource.from_source('sqlite', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return SQLiteSource(db_path, table, fields).to_scenario_list()
1766
+
1767
@staticmethod
def _from_latex(file_path: str, table_index: int = 0, has_header: bool = True):
    """Deprecated shim: build a ScenarioList from a LaTeX file.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``LaTeXSource(file_path, table_index, has_header).to_scenario_list()``.
    The arguments are handed to ``LaTeXSource`` positionally.
    """
    deprecation_message = (
        "_from_latex is deprecated. Use LaTeXSource directly or ScenarioSource.from_source('latex', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return LaTeXSource(file_path, table_index, has_header).to_scenario_list()
1779
+
1780
@staticmethod
def _from_google_doc(url: str):
    """Deprecated shim: build a ScenarioList from a Google Doc.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``GoogleDocSource(url).to_scenario_list()``.
    """
    deprecation_message = (
        "_from_google_doc is deprecated. Use GoogleDocSource directly or ScenarioSource.from_source('google_doc', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return GoogleDocSource(url).to_scenario_list()
1790
+
1791
@staticmethod
def _from_pandas(df):
    """Deprecated shim: build a ScenarioList from a pandas DataFrame.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``PandasSource(df).to_scenario_list()``.
    """
    deprecation_message = (
        "_from_pandas is deprecated. Use PandasSource directly or ScenarioSource.from_source('pandas', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return PandasSource(df).to_scenario_list()
1801
+
1802
@staticmethod
def _from_dta(file_path: str, include_metadata: bool = True):
    """Deprecated shim: build a ScenarioList from a Stata data file.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``StataSource(file_path, include_metadata).to_scenario_list()``.
    Both arguments are handed to ``StataSource`` positionally.
    """
    deprecation_message = (
        "_from_dta is deprecated. Use StataSource directly or ScenarioSource.from_source('dta', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return StataSource(file_path, include_metadata).to_scenario_list()
1812
+
1813
@staticmethod
def _from_wikipedia(url: str, table_index: int = 0, header: bool = True):
    """Deprecated shim: build a ScenarioList from a Wikipedia table.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``WikipediaSource(url, table_index, header).to_scenario_list()``.
    The arguments are handed to ``WikipediaSource`` positionally.
    """
    deprecation_message = (
        "_from_wikipedia is deprecated. Use WikipediaSource directly or ScenarioSource.from_source('wikipedia', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return WikipediaSource(url, table_index, header).to_scenario_list()
1825
+
1826
@staticmethod
def _from_excel(file_path: str, sheet_name: Optional[str] = None, **kwargs):
    """Deprecated shim: build a ScenarioList from an Excel file.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``ExcelSource(file_path, sheet_name=sheet_name, **kwargs).to_scenario_list()``.
    ``file_path`` is passed positionally; ``sheet_name`` and any extra
    keyword arguments are passed by keyword.
    """
    deprecation_message = (
        "_from_excel is deprecated. Use ExcelSource directly or ScenarioSource.from_source('excel', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return ExcelSource(
        file_path, sheet_name=sheet_name, **kwargs
    ).to_scenario_list()
1838
+
1839
@staticmethod
def _from_google_sheet(
    url: str,
    sheet_name: Optional[str] = None,
    column_names: Optional[List[str]] = None,
    **kwargs,
):
    """Deprecated shim: build a ScenarioList from a Google Sheet.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    the ``to_scenario_list()`` result of a ``GoogleSheetSource`` built
    from ``url`` (positional) plus ``sheet_name``, ``column_names`` and
    any extra keyword arguments (all passed by keyword).
    """
    deprecation_message = (
        "_from_google_sheet is deprecated. Use GoogleSheetSource directly or ScenarioSource.from_source('google_sheet', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return GoogleSheetSource(
        url, sheet_name=sheet_name, column_names=column_names, **kwargs
    ).to_scenario_list()
1849
+
1850
@staticmethod
def _from_delimited_file(
    file_or_url: str,
    delimiter: str = ",",
    has_header: bool = True,
    encoding: str = "utf-8",
    **kwargs,
):
    """Deprecated shim: build a ScenarioList from a delimited file or URL.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    the ``to_scenario_list()`` result of a ``DelimitedFileSource``. The
    four named parameters and any extra keyword arguments are all
    forwarded by keyword.
    """
    deprecation_message = (
        "_from_delimited_file is deprecated. Use DelimitedFileSource directly or ScenarioSource.from_source('delimited_file', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    # The named parameters can never also appear in **kwargs, so merging
    # the two keyword sets in one call cannot produce duplicates.
    named_options = {
        "file_or_url": file_or_url,
        "delimiter": delimiter,
        "has_header": has_header,
        "encoding": encoding,
    }
    return DelimitedFileSource(**named_options, **kwargs).to_scenario_list()
1872
+
1873
@staticmethod
def _from_csv(file_or_url: str, **kwargs):
    """Deprecated shim: build a ScenarioList from a CSV file or URL.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``CSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()``.
    """
    deprecation_message = (
        "_from_csv is deprecated. Use CSVSource directly or ScenarioSource.from_source('csv', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return CSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()
1883
+
1884
@staticmethod
def _from_tsv(file_or_url: str, **kwargs):
    """Deprecated shim: build a ScenarioList from a TSV file or URL.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``TSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()``.
    """
    deprecation_message = (
        "_from_tsv is deprecated. Use TSVSource directly or ScenarioSource.from_source('tsv', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return TSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()
1894
+
1895
@staticmethod
def _from_dict(data: dict):
    """Create a ScenarioList from a dictionary.

    Two layouts are accepted:

    * A dictionary containing a ``"scenarios"`` key: each element of
      ``data["scenarios"]`` is wrapped in a ``Scenario`` and the optional
      ``"codebook"`` entry (``{}`` when absent) is passed to
      ``ScenarioList`` as its second argument.
    * Otherwise, a column-oriented mapping of field name to list of
      values. All lists must have the same length; scenario ``i`` holds
      the ``i``-th value of every column.

    An empty dictionary produces a ScenarioList built from an empty list
    instead of failing with ``IndexError``.

    Args:
        data: The dictionary in one of the two layouts described above.

    Returns:
        A ``ScenarioList`` built from the dictionary contents.

    Raises:
        ScenarioError: If, in the column-oriented layout, a value is not
            a list or the lists differ in length.
    """
    from .scenario_list import ScenarioList

    if "scenarios" in data:
        scenarios = [Scenario(s) for s in data["scenarios"]]
        codebook = data.get("codebook", {})
        return ScenarioList(scenarios, codebook)

    # Column-oriented layout: field name -> list of values.
    field_names = list(data)
    columns = list(data.values())
    if not all(isinstance(column, list) for column in columns):
        raise ScenarioError("All values in the dictionary must be lists")

    # More than one distinct length means the columns are ragged. With no
    # columns at all the set is empty and the check passes.
    if len({len(column) for column in columns}) > 1:
        raise ScenarioError("All lists must have the same length")

    # zip(*columns) walks the columns row by row and yields nothing when
    # there are no columns, so the empty-dict case needs no indexing.
    scenarios = [
        Scenario(dict(zip(field_names, row))) for row in zip(*columns)
    ]
    return ScenarioList(scenarios)
1922
+
1923
@staticmethod
def _from_nested_dict(data: dict, id_field: Optional[str] = None):
    """Create a ScenarioList from a dictionary of dictionaries.

    Each value of ``data`` must itself be a dictionary; a copy of it
    becomes one ``Scenario``. When ``id_field`` is truthy, the outer key
    is stored in that copy under ``id_field`` (overwriting any existing
    entry of that name) before the ``Scenario`` is created.

    Args:
        data: Mapping of outer key to per-scenario dictionary.
        id_field: Optional field name under which to record the outer key.

    Returns:
        A ``ScenarioList`` with one scenario per entry of ``data``, in
        iteration order.

    Raises:
        ScenarioError: If any value of ``data`` is not a dictionary.
    """
    from .scenario_list import ScenarioList

    def to_scenario(outer_key, inner):
        # Validate and convert one entry at a time so that errors surface
        # in the same order as the entries are iterated.
        if not isinstance(inner, dict):
            raise ScenarioError(f"Value for key {outer_key} is not a dictionary")
        fields = inner.copy()
        if id_field:
            fields[id_field] = outer_key
        return Scenario(fields)

    return ScenarioList(
        [to_scenario(outer_key, inner) for outer_key, inner in data.items()]
    )
1940
+
1941
@staticmethod
def _from_parquet(file_path: str):
    """Deprecated shim: build a ScenarioList from a Parquet file.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    ``ParquetSource(file_path).to_scenario_list()``.
    """
    deprecation_message = (
        "_from_parquet is deprecated. Use ParquetSource directly or ScenarioSource.from_source('parquet', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return ParquetSource(file_path).to_scenario_list()
1951
+
1952
@staticmethod
def _from_pdf(
    file_path: str,
    chunk_type: Literal["page", "text"] = "page",
    chunk_size: int = 1,
    chunk_overlap: int = 0,
):
    """Deprecated shim: build a ScenarioList from a PDF file.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    the ``to_scenario_list()`` result of a ``PDFSource``. All four
    parameters are forwarded by keyword under the same names.
    """
    deprecation_message = (
        "_from_pdf is deprecated. Use PDFSource directly or ScenarioSource.from_source('pdf', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return PDFSource(
        file_path=file_path,
        chunk_type=chunk_type,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).to_scenario_list()
1972
+
1973
@staticmethod
def _from_pdf_to_image(
    file_path: str,
    base_width: int = 2000,
    include_text: bool = True,
):
    """Deprecated shim: build a ScenarioList of images taken from a PDF.

    Issues a ``DeprecationWarning`` (with ``stacklevel=2``) and returns
    the ``to_scenario_list()`` result of a ``PDFImageSource``. All three
    parameters are forwarded by keyword under the same names.
    """
    deprecation_message = (
        "_from_pdf_to_image is deprecated. Use PDFImageSource directly or ScenarioSource.from_source('pdf_to_image', ...) instead."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return PDFImageSource(
        file_path=file_path,
        base_width=base_width,
        include_text=include_text,
    ).to_scenario_list()