edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +3 -2
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +105 -7
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/invigilators/invigilators.py +10 -1
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +127 -46
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +102 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +45 -75
- edsl/language_models/registry.py +5 -0
- edsl/language_models/utilities.py +2 -1
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_check_box.py +171 -149
- edsl/questions/question_dict.py +243 -51
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +63 -16
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +117 -6
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1990 @@
|
|
1
|
+
"""
|
2
|
+
ScenarioSource provides factory methods for creating ScenarioList objects from external sources.
|
3
|
+
|
4
|
+
This module contains the ScenarioSource class, which serves as a factory for creating
|
5
|
+
ScenarioList objects from various external data sources like files, directories, and URLs.
|
6
|
+
It centralizes all the file/external-source creation logic that was previously scattered
|
7
|
+
across different classmethods in the ScenarioList class.
|
8
|
+
|
9
|
+
Key features include:
|
10
|
+
- A unified from_source method that dispatches to appropriate source-specific methods
|
11
|
+
- Support for various data sources (CSV, Excel, PDF, directories, URLs, etc.)
|
12
|
+
- Deprecation decorators for backward compatibility with ScenarioList class methods
|
13
|
+
"""
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
import functools
|
17
|
+
import warnings
|
18
|
+
import fnmatch
|
19
|
+
from typing import Any, Callable, List, Literal, Optional, Type, TypeVar, Union, TYPE_CHECKING, cast, Any
|
20
|
+
|
21
|
+
T = TypeVar('T')
|
22
|
+
|
23
|
+
def deprecated_classmethod(alternative: str) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a decorator that flags a class method as deprecated.

    Each call to the decorated callable first emits a ``DeprecationWarning``
    naming the callable and the suggested replacement, then forwards every
    argument to the original callable and returns its result.

    Args:
        alternative: Name of the API callers should use instead; it is
            interpolated into the warning text.

    Returns:
        A decorator that wraps a callable with the deprecation warning.
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        def warn_then_call(*args: Any, **kwargs: Any) -> T:
            message = f"{func.__qualname__} is deprecated. Use {alternative} instead."
            # stacklevel=2 attributes the warning to the caller's line
            # instead of to this wrapper.
            warnings.warn(message, DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        # Copy name, docstring and other metadata from the wrapped callable.
        return functools.wraps(func)(warn_then_call)

    return decorator
|
44
|
+
|
45
|
+
import os
|
46
|
+
import csv
|
47
|
+
import json
|
48
|
+
import warnings
|
49
|
+
from io import StringIO
|
50
|
+
from urllib.parse import urlparse
|
51
|
+
|
52
|
+
if TYPE_CHECKING:
|
53
|
+
import pandas as pd
|
54
|
+
from urllib.parse import ParseResult
|
55
|
+
from .scenario_list import ScenarioList
|
56
|
+
|
57
|
+
# Local imports
|
58
|
+
from .scenario import Scenario
|
59
|
+
from .directory_scanner import DirectoryScanner
|
60
|
+
from .exceptions import ScenarioError
|
61
|
+
|
62
|
+
from abc import ABC, abstractmethod
|
63
|
+
|
64
|
+
class Source(ABC):
    """
    Abstract base for objects that can be converted into a ScenarioList.

    Any subclass exposing a ``source_type`` attribute is recorded in a shared
    registry keyed by that value, so it can later be looked up by name.
    """

    # Maps each source_type string to the subclass that registered it.
    _registry: dict[str, Type['Source']] = {}

    def __init_subclass__(cls, **kwargs):
        """Register the new subclass under its source_type, when it has one."""
        super().__init_subclass__(**kwargs)
        if not hasattr(cls, 'source_type'):
            return
        Source._registry[cls.source_type] = cls

    @classmethod
    @abstractmethod
    def example(cls) -> 'Source':
        """
        Build a sample instance of this Source type.

        Implementations should return an instance configured with sensible
        defaults so that calling to_scenario_list() on it yields a valid
        ScenarioList; it is intended for testing.

        Returns:
            An instance of the Source subclass
        """

    @abstractmethod
    def to_scenario_list(self):
        """
        Turn this source into a ScenarioList.

        Returns:
            A ScenarioList containing the data from this source
        """

    @classmethod
    def get_source_class(cls, source_type: str) -> Type['Source']:
        """
        Look up the Source subclass registered for ``source_type``.

        Raises:
            ValueError: If nothing is registered under ``source_type``.
        """
        if source_type in cls._registry:
            return cls._registry[source_type]
        raise ValueError(f"No Source subclass found for source_type: {source_type}")

    @classmethod
    def get_registered_types(cls) -> list[str]:
        """Return every registered source type, in registration order."""
        return [*cls._registry]

    @classmethod
    def test_all_sources(cls) -> dict[str, bool]:
        """
        Smoke-test every registered source type.

        For each registered class an example instance is built and converted
        with to_scenario_list(). A type passes when the conversion returns a
        ScenarioList; a wrong return type or any exception counts as a
        failure and is reported with print().

        Returns:
            A dictionary mapping source types to boolean success values
        """
        from .scenario_list import ScenarioList

        outcome = {}
        for source_type, source_class in cls._registry.items():
            try:
                produced = source_class.example().to_scenario_list()
                passed = isinstance(produced, ScenarioList)
                outcome[source_type] = passed
                if not passed:
                    print(f"Source {source_type} returned {type(produced)} instead of ScenarioList")
            except Exception as e:
                outcome[source_type] = False
                print(f"Source {source_type} exception: {e}")
        return outcome
|
140
|
+
|
141
|
+
class URLSource(Source):
    """
    Source that downloads a list of URLs and stores each response body as text.

    Every URL that is fetched successfully becomes one Scenario whose single
    field (named by ``field_name``) holds the response text. URLs that fail
    are reported with a warning and skipped.
    """

    source_type = "urls"

    def __init__(self, urls: list[str], field_name: str, timeout: Optional[float] = 30.0):
        """
        Args:
            urls: The URLs to download.
            field_name: Scenario key under which each response text is stored.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the conversion forever. Pass
                ``None`` to wait indefinitely.
        """
        self.urls = urls
        self.field_name = field_name
        self.timeout = timeout

    @classmethod
    def example(cls) -> 'URLSource':
        """Return an example URLSource instance."""
        return cls(
            urls=['http://www.example.com'],
            field_name="text"
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from a list of URLs.

        Returns:
            A ScenarioList with one Scenario per URL that could be fetched.
            Failed URLs (connection errors, timeouts, HTTP error statuses)
            emit a warning and are left out of the result.
        """
        import requests

        from .scenario_list import ScenarioList

        result = ScenarioList()
        for url in self.urls:
            try:
                # Without a timeout requests can wait forever on a dead host.
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                # Best effort: report the failure and carry on with the rest.
                warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
                continue
            result.append(Scenario({self.field_name: response.text}))

        return result
|
174
|
+
|
175
|
+
|
176
|
+
class ListSource(Source):
    """
    Source that turns a plain list of values into single-field scenarios.

    Each value becomes a Scenario keyed by ``field_name``; when
    ``use_indexes`` is true the value's position is added under ``"idx"``.
    """

    source_type = "list"

    def __init__(self, field_name: str, values: list, use_indexes: bool = False):
        self.field_name = field_name
        self.values = values
        self.use_indexes = use_indexes

    @classmethod
    def example(cls) -> 'ListSource':
        """Return an example ListSource instance."""
        return cls(
            field_name="text",
            values=["example1", "example2", "example3"],
            use_indexes=True
        )

    def to_scenario_list(self):
        """Create a ScenarioList from a list of values with a specified field name."""
        from .scenario_list import ScenarioList

        def build_record(position, value):
            # "idx" is written last, so it wins if field_name is also "idx".
            if self.use_indexes:
                return {self.field_name: value, "idx": position}
            return {self.field_name: value}

        return ScenarioList(
            [
                Scenario(build_record(position, value))
                for position, value in enumerate(self.values)
            ]
        )
|
206
|
+
|
207
|
+
|
208
|
+
class DirectorySource(Source):
    """
    Source that builds one Scenario per file found in a directory.

    ``directory`` may be a plain directory path or may itself contain
    wildcards (``*.txt``, ``/path/*.py``, ``src/**/*.py``). In the wildcard
    case it is split into a base directory and a pattern, and the
    ``pattern`` argument is not used.
    """

    source_type = "directory"

    def __init__(
        self,
        directory: str,
        pattern: str = "*",
        recursive: bool = False,
        metadata: bool = True,
        ignore_dirs: Optional[List[str]] = None,
        ignore_files: Optional[List[str]] = None,
    ):
        """
        Args:
            directory: Directory to scan, optionally containing wildcards.
            pattern: File pattern used when ``directory`` has no wildcard.
            recursive: Passed to the directory scanner; forced to True when
                ``directory`` contains ``**``.
            metadata: Whether to add file path/name/size/timestamps to each
                scenario.
            ignore_dirs: Directory names/patterns to skip (None means none).
            ignore_files: File name patterns to skip (None means none).
        """
        self.directory = directory
        self.pattern = pattern
        self.recursive = recursive
        self.metadata = metadata
        self.ignore_dirs = ignore_dirs or []
        self.ignore_files = ignore_files or []

    @classmethod
    def example(cls) -> 'DirectorySource':
        """
        Return an example DirectorySource instance.

        A temporary directory holding two text files and a subdirectory with
        a third is created on disk; it is not removed afterwards.
        """
        import tempfile
        import os

        # Create a temporary directory for the example
        temp_dir = tempfile.mkdtemp(prefix="edsl_test_")

        # Create some sample files in the directory
        with open(os.path.join(temp_dir, "test1.txt"), "w") as f:
            f.write("Sample content 1")

        with open(os.path.join(temp_dir, "test2.txt"), "w") as f:
            f.write("Sample content 2")

        # Create a subdirectory with a file
        subdir = os.path.join(temp_dir, "subdir")
        os.makedirs(subdir, exist_ok=True)
        with open(os.path.join(subdir, "test3.txt"), "w") as f:
            f.write("Sample content 3")

        return cls(
            directory=temp_dir,
            pattern="*.txt",
            recursive=True,
            metadata=True,
            ignore_dirs=["__pycache__"],
            ignore_files=["*.pyc"]
        )

    def _is_in_ignored_dir(self, file_path: str, base_directory: str) -> bool:
        """Tell whether file_path sits below a directory listed in ignore_dirs."""
        import os

        if not self.ignore_dirs:
            return False
        relative_parent = os.path.dirname(os.path.relpath(file_path, base_directory))
        if not relative_parent:
            return False
        # NOTE(review): entries are matched per path component with fnmatch,
        # mirroring how ignore_files is matched here; confirm this agrees
        # with DirectoryScanner's interpretation of ignore_dirs.
        return any(
            fnmatch.fnmatch(component, ignored)
            for component in relative_parent.split(os.sep)
            for ignored in self.ignore_dirs
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from files in a directory.

        Patterns containing ``**`` are expanded with ``glob`` directly; all
        other patterns are delegated to ``DirectoryScanner.scan_directory``.

        Returns:
            A ScenarioList with one Scenario per matching file. In the ``**``
            branch each scenario holds a ``file`` FileStore plus, when
            ``metadata`` is true, the file's path, name, size and timestamps;
            files are returned in sorted path order.

        Raises:
            FileNotFoundScenarioError: If the resolved base directory does
                not exist.
        """
        import os
        import glob

        from .scenario_list import ScenarioList

        recursive = self.recursive

        # Split a wildcard-bearing "directory" into base directory + pattern.
        if '*' in self.directory:
            if "**" in self.directory:
                # "base/**/*.py": everything before the first ** is the base.
                parts = self.directory.split("**")
                directory = parts[0].rstrip("/\\")
                if not directory:
                    directory = os.getcwd()
                pattern = f"**{parts[1]}" if len(parts) > 1 else "**/*"
                # ** only makes sense for a recursive search.
                recursive = True
            elif os.path.dirname(self.directory) == "":
                # "*.txt": bare wildcard, relative to the working directory.
                directory = os.getcwd()
                pattern = self.directory
            else:
                # "/path/to/dir/*.py"
                directory = os.path.dirname(self.directory)
                pattern = os.path.basename(self.directory)
        else:
            directory = self.directory
            pattern = self.pattern

        if not os.path.isdir(directory):
            from .exceptions import FileNotFoundScenarioError
            raise FileNotFoundScenarioError(f"Directory not found: {directory}")

        if "**" not in pattern:
            # Use the standard scanning method for non-** patterns
            return DirectoryScanner.scan_directory(
                directory=directory,
                pattern=pattern,
                recursive=recursive,
                metadata=self.metadata,
                ignore_dirs=self.ignore_dirs,
                ignore_files=self.ignore_files,
            )

        from .file_store import FileStore

        # Use glob directly for ** patterns. De-duplicate, then sort: a bare
        # set has no stable order, so results would differ between runs.
        full_pattern = os.path.join(directory, pattern)
        file_paths = sorted(set(glob.glob(full_pattern, recursive=True)))

        scenarios = []
        for file_path in file_paths:
            if not os.path.isfile(file_path):
                continue

            file_name = os.path.basename(file_path)
            if any(fnmatch.fnmatch(file_name, ignored) for ignored in self.ignore_files):
                continue
            # Honour ignore_dirs here as well, as the scanner branch does.
            if self._is_in_ignored_dir(file_path, directory):
                continue

            scenario_data = {"file": FileStore(file_path)}

            if self.metadata:
                file_stat = os.stat(file_path)
                scenario_data.update({
                    "file_path": file_path,
                    "file_name": file_name,
                    "file_size": file_stat.st_size,
                    "file_created": file_stat.st_ctime,
                    "file_modified": file_stat.st_mtime,
                })

            scenarios.append(Scenario(scenario_data))

        return ScenarioList(scenarios)
|
347
|
+
|
348
|
+
|
349
|
+
class TuplesSource(Source):
    """
    Source that maps a list of tuples onto a fixed list of field names.

    Each tuple (or list) becomes one Scenario whose keys are ``field_names``;
    when ``use_indexes`` is true the row position is added under ``"idx"``.
    """

    source_type = "list_of_tuples"

    def __init__(self, field_names: list[str], values: list[tuple], use_indexes: bool = False):
        """
        Raises:
            ScenarioError: If any entry of ``values`` is neither a tuple nor
                a list.
        """
        self.field_names = field_names
        self.values = values
        self.use_indexes = use_indexes

        # Reject anything that is not a tuple or list up front.
        if any(not isinstance(entry, (tuple, list)) for entry in values):
            raise ScenarioError("All values must be tuples or lists")

    @classmethod
    def example(cls) -> 'TuplesSource':
        """Return an example TuplesSource instance."""
        people = [
            ("Alice", 30, "New York"),
            ("Bob", 25, "San Francisco"),
            ("Charlie", 35, "Boston")
        ]
        return cls(
            field_names=["name", "age", "city"],
            values=people,
            use_indexes=True
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from a list of tuples with specified field names.

        Raises:
            ScenarioError: If a tuple's length differs from the number of
                field names.
        """
        from .scenario_list import ScenarioList

        expected_width = len(self.field_names)
        built = []

        for i, value_tuple in enumerate(self.values):
            if len(value_tuple) != expected_width:
                raise ScenarioError(
                    f"Tuple {i} has {len(value_tuple)} elements, but {len(self.field_names)} field names were provided."
                )

            record = {name: item for name, item in zip(self.field_names, value_tuple)}
            if self.use_indexes:
                record["idx"] = i
            built.append(Scenario(record))

        return ScenarioList(built)
|
392
|
+
|
393
|
+
|
394
|
+
class SQLiteSource(Source):
    """
    Source that reads the rows of one table from a SQLite database file.

    Each row becomes a Scenario keyed by column name. When ``fields`` is
    None, every column reported by ``PRAGMA table_info`` is selected.
    """

    source_type = "sqlite"

    def __init__(self, db_path: str, table: str, fields: Optional[list] = None):
        """
        Args:
            db_path: Path to the SQLite database file.
            table: Name of the table to read.
            fields: Column names to select; None selects all columns.
        """
        self.db_path = db_path
        self.table = table
        self.fields = fields

    @classmethod
    def example(cls) -> 'SQLiteSource':
        """
        Return an example SQLiteSource instance.

        A temporary database file holding a three-row ``test_table`` is
        created on disk; it is not removed afterwards.
        """
        import sqlite3
        import tempfile
        import os

        # Create a temporary SQLite database for the example
        fd, temp_path = tempfile.mkstemp(suffix='.db', prefix='edsl_test_')
        os.close(fd)  # sqlite3 opens the file itself; only the path is needed

        sample_data = [
            (1, 'Alpha', 100),
            (2, 'Beta', 200),
            (3, 'Gamma', 300)
        ]

        conn = sqlite3.connect(temp_path)
        try:
            cursor = conn.cursor()
            cursor.execute('CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)')
            cursor.executemany('INSERT INTO test_table VALUES (?, ?, ?)', sample_data)
            conn.commit()
        finally:
            # Close the connection even if creating or filling the table fails.
            conn.close()

        return cls(
            db_path=temp_path,
            table='test_table',
            fields=['id', 'name', 'value']
        )

    def to_scenario_list(self):
        """
        Create a ScenarioList from a SQLite database.

        Returns:
            A ScenarioList with one Scenario per row of the table.

        Raises:
            sqlite3.Error: If the database cannot be queried (propagated
                unchanged from the sqlite3 module).
        """
        from .scenario_list import ScenarioList
        import sqlite3

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # If fields weren't provided, get all fields from the table
            fields = self.fields
            if fields is None:
                # NOTE(review): table and field names are interpolated into
                # the SQL text because identifiers cannot be bound as
                # parameters; do not pass untrusted values for them.
                cursor.execute(f"PRAGMA table_info({self.table})")
                # Column 1 of each table_info row is the column name.
                fields = [row[1] for row in cursor.fetchall()]

            field_placeholders = ", ".join(fields)
            cursor.execute(f"SELECT {field_placeholders} FROM {self.table}")
            rows = cursor.fetchall()
        finally:
            # Release the connection on the error path too, not only on success.
            conn.close()

        return ScenarioList([Scenario(dict(zip(fields, row))) for row in rows])
|
464
|
+
|
465
|
+
|
466
|
+
class LaTeXSource(Source):
    """
    Source that extracts one ``tabular`` environment from a LaTeX file.

    Rows are split on ``\\\\`` and cells on ``&``; each data row becomes a
    Scenario keyed by the header cells (or by generated ``colN`` names).
    """

    source_type = "latex"

    def __init__(self, file_path: str, table_index: int = 0, has_header: bool = True):
        """
        Initialize a LaTeXSource with a LaTeX file path.

        Args:
            file_path: The path to the LaTeX file.
            table_index: The index of the table to extract (if multiple tables exist).
                Default is 0 (first table).
            has_header: Whether the table has a header row. Default is True.
        """
        self.file_path = file_path
        self.table_index = table_index
        self.has_header = has_header

    @classmethod
    def example(cls) -> 'LaTeXSource':
        """
        Return an example LaTeXSource instance.

        A temporary .tex file containing a small three-column table is
        written to disk; it is not removed afterwards.
        """
        import tempfile
        import os

        # Create a temporary LaTeX file with a sample table
        fd, temp_path = tempfile.mkstemp(suffix='.tex', prefix='edsl_test_')
        os.close(fd)  # Close the file descriptor

        # Write a sample LaTeX table to the file
        sample_latex = r"""
\documentclass{article}
\begin{document}
This is a sample document with a table:

\begin{tabular}{lrr}
\textbf{Name} & \textbf{Age} & \textbf{Score} \\
Alice & 30 & 95 \\
Bob & 25 & 87 \\
Charlie & 35 & 92 \\
\end{tabular}

\end{document}
"""
        with open(temp_path, 'w') as f:
            f.write(sample_latex)

        return cls(
            file_path=temp_path,
            table_index=0,
            has_header=True
        )

    @staticmethod
    def _strip_column_spec(table_body: str) -> str:
        """
        Remove the leading ``[pos]{column spec}`` arguments of a tabular.

        The text captured after ``\\begin{tabular}`` still starts with the
        environment's arguments (e.g. ``{lrr}`` or ``[t]{l|p{3cm}}``). Left
        in place they would be glued onto the first cell of the first row.
        The input is returned unchanged when no balanced leading ``{...}``
        group is found.
        """
        text = table_body.lstrip()

        # Optional vertical-position argument such as [t] or [b].
        if text.startswith("["):
            closing = text.find("]")
            if closing != -1:
                text = text[closing + 1:].lstrip()

        if not text.startswith("{"):
            return table_body

        # Walk to the brace that balances the opening one, because specs
        # such as p{3cm} or @{} nest braces.
        depth = 0
        for position, char in enumerate(text):
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[position + 1:]
        return table_body

    def to_scenario_list(self):
        """
        Create a ScenarioList from a LaTeX file.

        Returns:
            A ScenarioList with one Scenario per data row. Rows whose cell
            count differs from the header's are skipped. An empty
            ScenarioList is returned when the table has no rows.

        Raises:
            ScenarioError: If the file has no tabular environment at
                ``table_index``.
        """
        from .scenario_list import ScenarioList
        import re

        with open(self.file_path, "r") as f:
            content = f.read()

        # Find all tabular environments
        tabular_pattern = r"\\begin{tabular}(.*?)\\end{tabular}"
        tables = re.findall(tabular_pattern, content, re.DOTALL)

        if not tables or self.table_index >= len(tables):
            raise ScenarioError(f"No table found at index {self.table_index}")

        # Drop the column specification so it cannot leak into the first cell.
        table_content = self._strip_column_spec(tables[self.table_index])

        # Rows are terminated by a double backslash.
        rows = [row.strip() for row in table_content.split("\\\\") if row.strip()]

        if not rows:
            return ScenarioList()

        if self.has_header:
            header_row = rows[0]
            # Prefer \textbf{...} wrapped names; fall back to raw cells.
            header_cells = re.findall(r"\\textbf{(.*?)}", header_row)
            if not header_cells:
                header_cells = [h.strip() for h in header_row.split("&")]

            data_rows = rows[1:]
        else:
            # Auto-generate column names
            header_cells = [f"col{i}" for i in range(rows[0].count("&") + 1)]
            data_rows = rows

        scenarios = []
        for row in data_rows:
            cells = [cell.strip() for cell in row.split("&")]

            if len(cells) != len(header_cells):
                continue  # Skip malformed rows

            scenarios.append(Scenario(dict(zip(header_cells, cells))))

        return ScenarioList(scenarios)
|
568
|
+
|
569
|
+
|
570
|
+
class GoogleDocSource(Source):
    """
    Source that downloads a Google Doc as .docx and yields its paragraphs.

    The document is exported through the ``export?format=docx`` URL and each
    paragraph becomes a Scenario with a single ``text`` field.
    """

    source_type = "google_doc"

    def __init__(self, url: str, timeout: Optional[float] = 30.0):
        """
        Initialize a GoogleDocSource with a Google Doc URL.

        Args:
            url: The URL to the Google Doc. It must contain ``/d/<id>/edit``.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled download cannot hang forever. Pass ``None`` to wait
                indefinitely.
        """
        self.url = url
        self.timeout = timeout

    @classmethod
    def example(cls) -> 'GoogleDocSource':
        """
        Return an example GoogleDocSource instance.

        The returned instance has to_scenario_list replaced by a stub that
        returns three fixed paragraphs, so no network access takes place.
        """
        # Create a mock instance that doesn't actually fetch a Google Doc
        instance = cls(url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit")

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Create a simple mock ScenarioList with a few paragraphs
            scenarios = [
                Scenario({"text": "This is paragraph 1 from a sample Google Doc."}),
                Scenario({"text": "This is paragraph 2 with some more content."}),
                Scenario({"text": "This is the final paragraph with a conclusion."})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        import types
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """
        Create a ScenarioList from a Google Doc.

        Returns:
            A ScenarioList with one ``{"text": paragraph}`` Scenario per
            paragraph of the document.

        Raises:
            ScenarioError: If the URL lacks the ``/d/<id>/edit`` shape, the
                download fails, or the downloaded file cannot be processed.
        """
        from .scenario_list import ScenarioList
        import os
        import tempfile
        import requests

        # Extract the document ID from the URL. Both markers are required;
        # a missing "/d/" is reported as an invalid URL.
        if "/edit" not in self.url or "/d/" not in self.url:
            raise ScenarioError("Invalid Google Doc URL format.")
        doc_id = self.url.split("/d/")[1].split("/edit")[0]

        # Create the export URL to download as DOCX
        export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"

        temp_filename = None
        try:
            # Download the Google Doc as a Word file (.docx)
            response = requests.get(export_url, timeout=self.timeout)
            response.raise_for_status()  # Ensure the request was successful

            # delete=False so the file can be reopened by path after this
            # handle is closed; it is removed explicitly in `finally`.
            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
                temp_filename = temp_file.name
                temp_file.write(response.content)

            from .DocxScenario import DocxScenario

            # Materialise every paragraph before the temp file is deleted.
            docx_scenario = DocxScenario(temp_filename)
            scenarios = [Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs]

            return ScenarioList(scenarios)

        except requests.RequestException as e:
            raise ScenarioError(f"Failed to fetch Google Doc: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"Error processing Google Doc: {str(e)}") from e
        finally:
            if temp_filename is not None:
                try:
                    os.unlink(temp_filename)
                except OSError:
                    # Cleanup is best effort; never mask the real outcome.
                    pass
|
644
|
+
|
645
|
+
|
646
|
+
class PandasSource(Source):
    """
    Source backed by an in-memory pandas DataFrame.

    Each DataFrame row becomes one Scenario whose keys are the column names.
    """

    source_type = "pandas"

    def __init__(self, df):
        """
        Initialize a PandasSource with a pandas DataFrame.

        Args:
            df: A pandas DataFrame.

        Raises:
            ImportError: If pandas cannot be imported.
            ScenarioError: If ``df`` is not a pandas DataFrame.
        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required for PandasSource")

        if not isinstance(df, pd.DataFrame):
            raise ScenarioError("Input must be a pandas DataFrame")
        self.df = df

    @classmethod
    def example(cls) -> 'PandasSource':
        """
        Return an example PandasSource instance.

        With pandas available the instance wraps a small four-row DataFrame.
        Without pandas, an instance is created without running __init__ and
        its to_scenario_list is replaced by a stub returning the same four
        rows.
        """
        try:
            import pandas as pd

            # Sample DataFrame for the example
            frame = pd.DataFrame(
                {
                    'name': ['Alice', 'Bob', 'Charlie', 'David'],
                    'age': [30, 25, 35, 28],
                    'city': ['New York', 'San Francisco', 'Boston', 'Seattle']
                }
            )
            return cls(frame)
        except ImportError:
            import types

            # Bypass __init__, which would itself require pandas.
            placeholder = cls.__new__(cls)

            def mock_to_scenario_list(self):
                from .scenario_list import ScenarioList
                # Hard-coded equivalent of the sample DataFrame above.
                return ScenarioList(
                    [
                        Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                        Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                        Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
                        Scenario({"name": "David", "age": 28, "city": "Seattle"})
                    ]
                )

            # Bind the stub to this one instance; the class is left untouched.
            placeholder.to_scenario_list = types.MethodType(mock_to_scenario_list, placeholder)
            return placeholder

    def to_scenario_list(self):
        """Create a ScenarioList from a pandas DataFrame."""
        from .scenario_list import ScenarioList

        # One Scenario per row, keyed by column name.
        return ScenarioList(
            [Scenario(record.to_dict()) for _, record in self.df.iterrows()]
        )
|
712
|
+
|
713
|
+
|
714
|
+
class StataSource(Source):
    """Source that builds a ScenarioList from a Stata (.dta) data file."""

    source_type = "dta"

    def __init__(self, file_path: str, include_metadata: bool = True):
        """
        Initialize a StataSource with a path to a Stata data file.

        Args:
            file_path: Path to the Stata (.dta) file.
            include_metadata: If True, extract and preserve variable labels and value labels
                             as additional metadata in the ScenarioList.
        """
        self.file_path = file_path
        self.include_metadata = include_metadata

    @classmethod
    def example(cls) -> 'StataSource':
        """Return an example StataSource instance.

        No real .dta file is created: the instance points at a placeholder
        path and its ``to_scenario_list`` is replaced, on this instance only,
        by a function returning fixed Stata-like sample data.
        """
        import types

        # Since we can't easily create a real Stata file for testing,
        # we'll create a mock instance with an override
        instance = cls(file_path="/path/to/nonexistent/file.dta")

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with Stata-like data
            scenarios = [
                Scenario({"id": 1, "gender": 1, "income": 50000, "education": 2}),
                Scenario({"id": 2, "gender": 2, "income": 45000, "education": 3}),
                Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4})
            ]

            result = ScenarioList(scenarios)

            # Add metadata similar to what would be in a Stata file
            if self.include_metadata:
                result.codebook = {
                    "variable_labels": {
                        "gender": "Gender (1=Male, 2=Female)",
                        "income": "Annual income in USD",
                        "education": "Education level (1-4)"
                    },
                    "value_labels": {
                        "gender": {1: "Male", 2: "Female"},
                        "education": {1: "High School", 2: "Associate", 3: "Bachelor", 4: "Graduate"}
                    }
                }

            return result

        # Replace the method on this instance only
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a Stata data file.

        Each row of the file becomes one Scenario. When ``include_metadata``
        is true and the file carries any labels, they are attached to the
        result as ``result.codebook`` with the keys ``"variable_labels"`` and
        ``"value_labels"``.

        Returns:
            A ScenarioList with one Scenario per data row.

        Raises:
            ImportError: If pandas is not installed.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError("pandas is required to read Stata files") from err

        variable_labels = {}
        value_labels = {}

        # The DataFrame returned by pd.read_stata carries no label metadata;
        # labels are only exposed by the StataReader. Open the file through
        # the reader interface so data and labels come from the same handle.
        with pd.read_stata(self.file_path, iterator=True) as reader:
            df = reader.read()
            if self.include_metadata:
                # Variables without a label are reported with an empty
                # string; keep only the ones that actually have a label.
                variable_labels = {
                    name: label
                    for name, label in reader.variable_labels().items()
                    if label
                }
                value_labels = reader.value_labels()

        # Create the basic ScenarioList, one Scenario per row
        scenarios = [Scenario(row.to_dict()) for _, row in df.iterrows()]
        result = ScenarioList(scenarios)

        # Store the metadata in the ScenarioList's codebook (only if any exists)
        if variable_labels or value_labels:
            result.codebook = {
                "variable_labels": variable_labels,
                "value_labels": value_labels,
            }

        return result
|
815
|
+
|
816
|
+
|
817
|
+
class WikipediaSource(Source):
    """Source that builds a ScenarioList from one HTML table on a Wikipedia page."""

    source_type = "wikipedia"

    def __init__(self, url: str, table_index: int = 0, header: bool = True):
        """
        Initialize a WikipediaSource with a URL to a Wikipedia page.

        Args:
            url: The URL of the Wikipedia page.
            table_index: The index of the table to extract (default is 0).
            header: Whether the table has a header row (default is True).
        """
        self.url = url
        self.table_index = table_index
        self.header = header

    @classmethod
    def example(cls) -> 'WikipediaSource':
        """Return an example WikipediaSource instance.

        The instance points at a real Wikipedia URL, but its
        ``to_scenario_list`` is replaced, on this instance only, by a function
        returning fixed sample rows, so no network access happens.
        """
        import types

        # Use a real Wikipedia URL for the example, but we'll override the to_scenario_list method
        instance = cls(
            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
            table_index=0,
            header=True
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with GDP data
            scenarios = [
                Scenario({"Rank": 1, "Country": "United States", "GDP (millions of USD)": 25460000}),
                Scenario({"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}),
                Scenario({"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}),
                Scenario({"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}),
                Scenario({"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000})
            ]

            return ScenarioList(scenarios)

        # Replace the method on this instance only
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a table on a Wikipedia page.

        The URL is first requested to verify it is reachable, then the page's
        tables are parsed with ``pandas.read_html`` and the table at
        ``table_index`` is converted row by row into Scenarios.

        Returns:
            A ScenarioList with one Scenario per table row.

        Raises:
            ImportError: If pandas is not installed.
            ScenarioError: If the URL cannot be fetched, the tables cannot be
                parsed, ``table_index`` is out of range, or any other error
                occurs while processing.
        """
        from .scenario_list import ScenarioList
        import requests

        try:
            # Try to import pandas
            import pandas as pd
        except ImportError as err:
            raise ImportError("pandas is required to read Wikipedia tables") from err

        try:
            # Check if the URL is reachable
            response = requests.get(self.url)
            response.raise_for_status()  # Raises HTTPError for bad responses

            # Extract tables from the Wikipedia page. read_html is handed the
            # URL, so it retrieves the page itself.
            tables = pd.read_html(self.url, header=0 if self.header else None)

            # Ensure the requested table index is within the range of available tables
            if self.table_index >= len(tables) or self.table_index < 0:
                raise ScenarioError(
                    f"Table index {self.table_index} is out of range. This page has {len(tables)} table(s)."
                )

            # Get the requested table and convert it to scenarios
            df = tables[self.table_index]
            scenarios = [Scenario(row.to_dict()) for _, row in df.iterrows()]

            return ScenarioList(scenarios)

        except ScenarioError:
            # Already a well-formed domain error (e.g. the index check above);
            # do not re-wrap it as an "unexpected" error.
            raise
        except requests.exceptions.RequestException as e:
            raise ScenarioError(f"Error fetching the URL: {str(e)}") from e
        except ValueError as e:
            raise ScenarioError(f"Error parsing tables: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"An unexpected error occurred: {str(e)}") from e
|
906
|
+
|
907
|
+
|
908
|
+
class ExcelSource(Source):
    """Source that builds a ScenarioList from one sheet of an Excel workbook."""

    source_type = "excel"

    def __init__(
        self,
        file_path: str,
        sheet_name: Optional[str] = None,
        skip_rows: Optional[List[int]] = None,
        use_codebook: bool = False,
        **kwargs
    ):
        """
        Initialize an ExcelSource with a path to an Excel file.

        Args:
            file_path: Path to the Excel file.
            sheet_name: Name of the sheet to load. If None and multiple sheets exist,
                        will raise an error listing available sheets.
            skip_rows: List of row indices to skip (0-based). If None, all rows are included.
            use_codebook: If True, rename columns to standard format and store original names in codebook.
            **kwargs: Additional parameters to pass to pandas.read_excel.
        """
        self.file_path = file_path
        self.sheet_name = sheet_name
        self.skip_rows = skip_rows
        self.use_codebook = use_codebook
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'ExcelSource':
        """Return an example ExcelSource instance.

        When pandas (and its Excel writer) can be imported, a temporary
        two-sheet workbook is written and the instance points at 'Sheet1'.
        If an ImportError occurs instead, a mock instance is returned whose
        ``to_scenario_list`` yields fixed sample rows.
        """
        import tempfile
        import os

        temp_path = None
        try:
            import pandas as pd

            # Create a temporary Excel file with sample data
            fd, temp_path = tempfile.mkstemp(suffix='.xlsx', prefix='edsl_test_')
            os.close(fd)  # Close the file descriptor

            # Create sample data
            df1 = pd.DataFrame({
                'name': ['Alice', 'Bob', 'Charlie'],
                'age': [30, 25, 35],
                'city': ['New York', 'San Francisco', 'Boston']
            })

            df2 = pd.DataFrame({
                'name': ['David', 'Eve'],
                'age': [40, 45],
                'city': ['Seattle', 'Chicago']
            })

            # Write to Excel file with multiple sheets
            with pd.ExcelWriter(temp_path) as writer:
                df1.to_excel(writer, sheet_name='Sheet1', index=False)
                df2.to_excel(writer, sheet_name='Sheet2', index=False)

            return cls(
                file_path=temp_path,
                sheet_name='Sheet1'
            )

        except ImportError:
            # The temp file may already exist if the failure happened while
            # writing (e.g. missing Excel engine); remove it, best effort.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

            # Create a mock instance with an override if pandas is not available
            instance = cls(file_path="/path/to/nonexistent/file.xlsx")

            # Override the to_scenario_list method just for the example
            def mock_to_scenario_list(self):
                from .scenario_list import ScenarioList
                # Create a simple mock ScenarioList with sample data
                scenarios = [
                    Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                    Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                    Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
                ]
                return ScenarioList(scenarios)

            # Replace the method on this instance only
            import types
            instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

            return instance

    def to_scenario_list(self):
        """Create a ScenarioList from an Excel file.

        If ``sheet_name`` was not given, the workbook's sheet names are
        inspected: a single sheet is used automatically, several sheets raise
        an error listing them. Rows whose index labels appear in
        ``skip_rows`` are dropped. With ``use_codebook`` the columns are
        renamed ``col_0``, ``col_1``, ... and the mapping back to the original
        names is stored on ``result.codebook``.

        Returns:
            A ScenarioList with one Scenario per remaining row.

        Raises:
            ImportError: If pandas is not installed.
            ScenarioError: If no sheet name was given and the workbook has
                more than one sheet.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError("pandas is required to read Excel files") from err

        sheet_name = self.sheet_name
        if sheet_name is None:
            # Only the sheet names are needed here, so avoid parsing the data
            # of every sheet just to choose one.
            with pd.ExcelFile(self.file_path) as workbook:
                sheet_names = list(workbook.sheet_names)

            if len(sheet_names) > 1:
                available_sheets = ", ".join([f"'{name}'" for name in sheet_names])
                raise ScenarioError(
                    f"The Excel file contains multiple sheets: {available_sheets}. "
                    "Please provide a sheet_name parameter."
                )
            # If there is only one sheet, use it
            sheet_name = sheet_names[0]

        # Load the specified or determined sheet
        df = pd.read_excel(self.file_path, sheet_name=sheet_name, **self.kwargs)

        # Skip specified rows if any, then restore continuous indexing
        if self.skip_rows:
            df = df.drop(self.skip_rows).reset_index(drop=True)

        if not self.use_codebook:
            # Create scenarios with original column names
            return ScenarioList(
                [Scenario(row.to_dict()) for _, row in df.iterrows()]
            )

        # codebook maps generated name -> original column; koobedoc is the
        # reverse mapping used to rename each row's keys.
        codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
        koobedoc = {col: f"col_{i}" for i, col in enumerate(df.columns)}

        scenarios = [
            Scenario({koobedoc.get(k): v for k, v in row.to_dict().items()})
            for _, row in df.iterrows()
        ]

        result = ScenarioList(scenarios)
        result.codebook = codebook
        return result
|
1050
|
+
|
1051
|
+
|
1052
|
+
class GoogleSheetSource(Source):
    """Source that builds a ScenarioList from a Google Sheet exported as XLSX."""

    source_type = "google_sheet"

    def __init__(
        self,
        url: str,
        sheet_name: Optional[str] = None,
        column_names: Optional[List[str]] = None,
        **kwargs
    ):
        """
        Initialize a GoogleSheetSource with a URL to a Google Sheet.

        Args:
            url: The URL of the Google Sheet.
            sheet_name: The name of the sheet to load. If None, the choice is left to
                       ExcelSource, which uses the only sheet of a single-sheet
                       workbook and raises if there are several.
            column_names: If provided, use these names for the columns instead
                         of the default column names from the sheet.
            **kwargs: Additional parameters to pass to pandas.read_excel.
        """
        self.url = url
        self.sheet_name = sheet_name
        self.column_names = column_names
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'GoogleSheetSource':
        """Return an example GoogleSheetSource instance.

        The instance uses a placeholder URL and its ``to_scenario_list`` is
        replaced, on this instance only, by a function returning fixed sample
        rows, so no network access happens.
        """
        import types

        # Use a mock instance since we can't create a real Google Sheet for testing
        instance = cls(
            url="https://docs.google.com/spreadsheets/d/1234567890abcdefg/edit",
            sheet_name="Sheet1"
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList

            # Create a simple mock ScenarioList with sample data
            scenarios = [
                Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a Google Sheet.

        The sheet id is taken from the text between ``/d/`` and ``/edit`` in
        the URL, the sheet is downloaded through the XLSX export endpoint into
        a temporary file, and that file is read with ``ExcelSource``. If
        ``column_names`` was given, the keys of every scenario are replaced
        positionally by those names.

        Returns:
            A ScenarioList with one Scenario per sheet row.

        Raises:
            ScenarioError: If the URL is not in the expected format, the
                download fails, the number of column names does not match, or
                any other error occurs while processing.
        """
        from .scenario_list import ScenarioList
        import os
        import tempfile
        import requests

        # Extract the sheet ID from the URL. Both markers are required;
        # without "/d/" the split below would fail with an IndexError.
        if "/edit" not in self.url or "/d/" not in self.url:
            raise ScenarioError("Invalid Google Sheet URL format.")
        sheet_id = self.url.split("/d/")[1].split("/edit")[0]

        # Create the export URL for XLSX format
        export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"

        temp_filename = None
        try:
            # Download the Google Sheet as an Excel file
            response = requests.get(export_url)
            response.raise_for_status()  # Ensure the request was successful

            # Save the Excel file to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
                temp_file.write(response.content)
                temp_filename = temp_file.name

            # Use ExcelSource to create the initial ScenarioList
            excel_source = ExcelSource(
                file_path=temp_filename,
                sheet_name=self.sheet_name,
                **self.kwargs
            )
            scenario_list = excel_source.to_scenario_list()

            # Apply column renaming if specified
            if self.column_names is not None and scenario_list:
                original_keys = list(scenario_list[0].keys())
                if len(self.column_names) != len(original_keys):
                    raise ScenarioError(
                        f"Number of provided column names ({len(self.column_names)}) "
                        f"does not match number of columns in sheet ({len(original_keys)})"
                    )

                # Create a mapping from original keys to new names
                column_mapping = dict(zip(original_keys, self.column_names))

                # Create a new ScenarioList with renamed columns
                renamed_scenarios = [
                    Scenario({column_mapping.get(k, k): v for k, v in scenario.items()})
                    for scenario in scenario_list
                ]
                return ScenarioList(renamed_scenarios)

            return scenario_list

        except requests.exceptions.RequestException as e:
            raise ScenarioError(f"Error fetching the Google Sheet: {str(e)}") from e
        except Exception as e:
            raise ScenarioError(f"Error processing Google Sheet: {str(e)}") from e
        finally:
            # The download was created with delete=False, so it must be
            # removed explicitly. ExcelSource has already read the rows into
            # scenarios by this point. Removal is best effort.
            if temp_filename is not None:
                try:
                    os.remove(temp_filename)
                except OSError:
                    pass
|
1163
|
+
|
1164
|
+
|
1165
|
+
class DelimitedFileSource(Source):
    """Source that builds a ScenarioList from a delimited text file or URL."""

    source_type = "delimited_file"

    def __init__(
        self,
        file_or_url: str,
        delimiter: str = ",",
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Initialize a DelimitedFileSource with a path to a delimited file or URL.

        Args:
            file_or_url: Path to a local file or URL to a remote file.
            delimiter: The delimiter character used in the file (default is ',').
            has_header: Whether the file has a header row (default is True).
            encoding: The file encoding to use (default is 'utf-8').
            **kwargs: Additional parameters for csv reader.
        """
        self.file_or_url = file_or_url
        self.delimiter = delimiter
        self.has_header = has_header
        self.encoding = encoding
        self.kwargs = kwargs

    @classmethod
    def example(cls) -> 'DelimitedFileSource':
        """Return an example DelimitedFileSource instance.

        A temporary comma-separated file with a header and three data rows is
        written and the returned instance points at it.
        """
        import tempfile
        import os

        # Create a temporary CSV file with sample data
        fd, temp_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
        os.close(fd)  # Close the file descriptor

        # Write sample data to the file
        with open(temp_path, 'w', newline='') as f:
            f.write("name,age,city\n")
            f.write("Alice,30,New York\n")
            f.write("Bob,25,San Francisco\n")
            f.write("Charlie,35,Boston\n")

        return cls(
            file_or_url=temp_path,
            delimiter=",",
            has_header=True
        )

    def _read_local_file(self) -> str:
        """Return the text of the local file, retrying with fallback encodings.

        The configured encoding is tried first. If decoding fails, cp1252 and
        then latin-1 are tried (skipping whichever equals the configured
        encoding).

        Raises:
            ScenarioError: If the first read fails for a reason other than
                decoding, or if no attempted encoding can decode the file.
        """
        try:
            with open(self.file_or_url, "r", encoding=self.encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            pass  # fall through to the fallback encodings below
        except Exception as e:
            raise ScenarioError(f"Failed to read file: {str(e)}") from e

        # cp1252 must come before latin-1: latin-1 accepts every byte
        # sequence, so anything listed after it would never be tried.
        fallback_encodings = [
            candidate
            for candidate in ("cp1252", "latin-1")
            if candidate != self.encoding
        ]
        for candidate in fallback_encodings:
            try:
                with open(self.file_or_url, "r", encoding=candidate) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue

        raise ScenarioError("Failed to decode file with any of the attempted encodings")

    def to_scenario_list(self):
        """Create a ScenarioList from a delimited file or URL.

        ``file_or_url`` is fetched over HTTP(S) when its scheme is ``http`` or
        ``https``; otherwise it is read as a local file. The text is parsed
        with ``csv.reader`` using the configured delimiter and extra kwargs.
        Without a header row, columns are named ``col0``, ``col1``, ...
        Rows whose length differs from the header are skipped with a warning.

        Returns:
            A ScenarioList with one Scenario per accepted data row (empty if
            the input has no rows).

        Raises:
            ScenarioError: If the URL cannot be fetched or the file cannot be
                read or decoded.
        """
        from .scenario_list import ScenarioList
        import requests

        # Check if the input is a URL
        parsed_url = urlparse(self.file_or_url)
        if parsed_url.scheme in ("http", "https"):
            try:
                headers = {
                    "Accept": "text/csv,application/csv,text/plain",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                }
                response = requests.get(self.file_or_url, headers=headers)
                response.raise_for_status()
                content = response.text
            except requests.RequestException as e:
                raise ScenarioError(f"Failed to fetch URL: {str(e)}") from e
        else:
            # Assume it's a file path
            content = self._read_local_file()

        # Parse the content
        csv_reader = csv.reader(StringIO(content), delimiter=self.delimiter, **self.kwargs)
        rows = list(csv_reader)

        if not rows:
            return ScenarioList()

        # Handle header row
        if self.has_header:
            header = rows[0]
            data_rows = rows[1:]
        else:
            # Auto-generate column names
            header = [f"col{i}" for i in range(len(rows[0]))]
            data_rows = rows

        # Create scenarios, skipping rows that do not match the header width
        scenarios = []
        for row in data_rows:
            if len(row) != len(header):
                warnings.warn(f"Skipping row with {len(row)} values (expected {len(header)})")
                continue

            scenarios.append(Scenario(dict(zip(header, row))))

        return ScenarioList(scenarios)
|
1283
|
+
|
1284
|
+
|
1285
|
+
class CSVSource(DelimitedFileSource):
    """DelimitedFileSource with the delimiter fixed to a comma."""

    source_type = "csv"

    def __init__(
        self,
        file_or_url: str,
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Initialize a CSVSource with a path to a CSV file or URL.

        Args:
            file_or_url: Path to a local file or URL to a remote file.
            has_header: Whether the file has a header row (default is True).
            encoding: The file encoding to use (default is 'utf-8').
            **kwargs: Additional parameters for csv reader.
        """
        # Everything is forwarded unchanged; only the delimiter is pinned.
        super().__init__(
            file_or_url=file_or_url,
            delimiter=",",
            has_header=has_header,
            encoding=encoding,
            **kwargs
        )

    @classmethod
    def example(cls) -> 'CSVSource':
        """Return an example CSVSource backed by a freshly written temp file.

        The file holds a header line and three comma-separated data rows.
        """
        import tempfile
        import os

        # mkstemp hands back an open descriptor; close it and reopen by path.
        handle, csv_path = tempfile.mkstemp(suffix='.csv', prefix='edsl_test_')
        os.close(handle)

        sample_lines = [
            "name,age,city\n",
            "Alice,30,New York\n",
            "Bob,25,San Francisco\n",
            "Charlie,35,Boston\n",
        ]
        with open(csv_path, 'w', newline='') as out:
            out.writelines(sample_lines)

        return cls(file_or_url=csv_path, has_header=True)
|
1333
|
+
|
1334
|
+
|
1335
|
+
class TSVSource(DelimitedFileSource):
    """DelimitedFileSource with the delimiter fixed to a tab character."""

    source_type = "tsv"

    def __init__(
        self,
        file_or_url: str,
        has_header: bool = True,
        encoding: str = "utf-8",
        **kwargs
    ):
        """
        Initialize a TSVSource with a path to a TSV file or URL.

        Args:
            file_or_url: Path to a local file or URL to a remote file.
            has_header: Whether the file has a header row (default is True).
            encoding: The file encoding to use (default is 'utf-8').
            **kwargs: Additional parameters for csv reader.
        """
        # Everything is forwarded unchanged; only the delimiter is pinned.
        super().__init__(
            file_or_url=file_or_url,
            delimiter="\t",
            has_header=has_header,
            encoding=encoding,
            **kwargs
        )

    @classmethod
    def example(cls) -> 'TSVSource':
        """Return an example TSVSource backed by a freshly written temp file.

        The file holds a header line and three tab-separated data rows.
        """
        import tempfile
        import os

        # mkstemp hands back an open descriptor; close it and reopen by path.
        handle, tsv_path = tempfile.mkstemp(suffix='.tsv', prefix='edsl_test_')
        os.close(handle)

        sample_lines = [
            "name\tage\tcity\n",
            "Alice\t30\tNew York\n",
            "Bob\t25\tSan Francisco\n",
            "Charlie\t35\tBoston\n",
        ]
        with open(tsv_path, 'w', newline='') as out:
            out.writelines(sample_lines)

        return cls(file_or_url=tsv_path, has_header=True)
|
1383
|
+
|
1384
|
+
class ParquetSource(Source):
    """Source that builds a ScenarioList from a Parquet file."""

    source_type = "parquet"

    def __init__(self, file_path: str):
        """
        Initialize a ParquetSource with a path to a Parquet file.

        Args:
            file_path: Path to the Parquet file.
        """
        self.file_path = file_path

    @classmethod
    def example(cls) -> 'ParquetSource':
        """Return an example ParquetSource instance.

        If pandas and pyarrow import successfully, a temporary Parquet file
        with three sample rows is written and the instance points at it.
        If an ImportError occurs, a mock instance is returned whose
        ``to_scenario_list`` yields the same three rows without touching disk.
        """
        import tempfile
        import os

        try:
            import pandas as pd
            # pyarrow is imported only to detect whether it is installed.
            import pyarrow as pa
            import pyarrow.parquet as pq

            # Reserve a temp path; the descriptor itself is not needed.
            handle, parquet_path = tempfile.mkstemp(suffix='.parquet', prefix='edsl_test_')
            os.close(handle)

            sample_columns = {
                'name': ['Alice', 'Bob', 'Charlie'],
                'age': [30, 25, 35],
                'city': ['New York', 'San Francisco', 'Boston']
            }
            pd.DataFrame(sample_columns).to_parquet(parquet_path)

            return cls(file_path=parquet_path)

        except ImportError:
            import types

            # Placeholder path: the mock below never reads it.
            mock_instance = cls(file_path="/path/to/nonexistent/file.parquet")

            def mock_to_scenario_list(self):
                from .scenario_list import ScenarioList
                return ScenarioList([
                    Scenario({"name": "Alice", "age": 30, "city": "New York"}),
                    Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
                    Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
                ])

            # Bind the replacement to this one instance only.
            mock_instance.to_scenario_list = types.MethodType(mock_to_scenario_list, mock_instance)

            return mock_instance

    def to_scenario_list(self):
        """Create a ScenarioList from a Parquet file.

        The file is read with ``pandas.read_parquet`` and every row is
        converted to a Scenario.

        Returns:
            A ScenarioList with one Scenario per row.

        Raises:
            ImportError: If pandas or pyarrow is not installed.
        """
        from .scenario_list import ScenarioList

        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required to read Parquet files")

        # pyarrow is not used directly here; the import is an availability check.
        try:
            import pyarrow
        except ImportError:
            raise ImportError("pyarrow is required to read Parquet files")

        frame = pd.read_parquet(self.file_path)

        row_scenarios = [
            Scenario(record.to_dict()) for _, record in frame.iterrows()
        ]
        return ScenarioList(row_scenarios)
|
1468
|
+
|
1469
|
+
|
1470
|
+
class PDFSource(Source):
    """Source that builds a ScenarioList from the text of a PDF file or URL."""

    source_type = "pdf"

    def __init__(
        self,
        file_path: str,
        chunk_type: Literal["page", "text"] = "page",
        chunk_size: int = 1,
        chunk_overlap: int = 0
    ):
        """
        Initialize a PDFSource with a path to a PDF file.

        Args:
            file_path: Path to the PDF file or URL to a PDF.
            chunk_type: Type of chunking to use ("page" or "text").
            chunk_size: Size of chunks to create.
            chunk_overlap: Number of overlapping chunks.
        """
        self.file_path = file_path
        self.chunk_type = chunk_type
        # NOTE(review): chunk_size and chunk_overlap are stored but not read
        # by to_scenario_list in this class.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @classmethod
    def example(cls) -> 'PDFSource':
        """Return an example PDFSource instance.

        The instance uses a placeholder path and its ``to_scenario_list`` is
        replaced, on this instance only, by a function returning two fixed
        sample pages.
        """
        import types

        # Skip actual file creation and just use a mock instance
        instance = cls(
            file_path="/path/to/nonexistent/file.pdf",
            chunk_type="page",
            chunk_size=1,
            chunk_overlap=0
        )

        # Override the to_scenario_list method just for the example
        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Create a simple mock ScenarioList with sample PDF data
            scenarios = [
                Scenario({"filename": "example.pdf", "page": 1, "text": "This is page 1 content"}),
                Scenario({"filename": "example.pdf", "page": 2, "text": "This is page 2 content"})
            ]
            return ScenarioList(scenarios)

        # Replace the method on this instance only
        instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)

        return instance

    def to_scenario_list(self):
        """Create a ScenarioList from a PDF file.

        A URL is first downloaded through ``PdfTools`` (Google Drive links use
        the dedicated downloader); a local path is used as is. With
        ``chunk_type="page"`` the scenarios produced by
        ``PdfTools.extract_text_from_pdf`` are returned unchanged. With
        ``chunk_type="text"`` the ``"text"`` values of all of them are
        concatenated into a single scenario copied from the first one; if the
        extraction produced nothing, an empty ScenarioList is returned.

        Returns:
            A ScenarioList of the extracted scenarios.

        Raises:
            ScenarioError: If ``chunk_type`` is invalid or any error occurs
                while downloading or processing the PDF.
        """
        from .scenario_list import ScenarioList
        from .scenario_list_pdf_tools import PdfTools

        try:
            # Reject an unsupported chunk_type before doing any download or
            # extraction work; it is reported through the same wrapper below.
            if self.chunk_type not in ("page", "text"):
                raise ValueError(f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'.")

            # Check if it's a URL
            if PdfTools.is_url(self.file_path):
                # Download the PDF file
                # NOTE(review): both download helpers are given the fixed name
                # "temp_pdf.pdf"; where that file is written is not visible
                # here - confirm it cannot clash between concurrent calls.
                if "drive.google.com" in self.file_path:
                    # It's a Google Drive URL
                    local_path = PdfTools.GoogleDriveDownloader.fetch_from_drive(
                        self.file_path, "temp_pdf.pdf"
                    )
                else:
                    # It's a regular URL
                    local_path = PdfTools.fetch_and_save_pdf(self.file_path, "temp_pdf.pdf")
            else:
                # It's a local file path
                local_path = self.file_path

            # Extract scenarios from the PDF
            scenarios = list(PdfTools.extract_text_from_pdf(local_path))

            if self.chunk_type == "page" or not scenarios:
                # Default behavior returns the extracted scenarios as they
                # are. An empty extraction has nothing to combine either, so
                # "text" mode returns the same empty list instead of failing
                # on scenarios[0].
                return ScenarioList(scenarios)

            # chunk_type == "text": combine all text into a single scenario
            # based on a copy of the first one.
            combined_text = "".join(scenario["text"] for scenario in scenarios)
            base_scenario = scenarios[0].copy()
            base_scenario["text"] = combined_text
            return ScenarioList([base_scenario])

        except Exception as e:
            from .exceptions import ScenarioError
            raise ScenarioError(f"Error processing PDF: {str(e)}") from e
|
1565
|
+
|
1566
|
+
|
1567
|
+
class PDFImageSource(Source):
    """Source that builds a ScenarioList by converting PDF pages to images."""

    source_type = "pdf_to_image"

    def __init__(
        self,
        file_path: str,
        base_width: int = 2000,
        include_text: bool = True
    ):
        """
        Initialize a PDFImageSource with a path to a PDF file.

        Args:
            file_path: Path to the PDF file.
            base_width: Width to use for the generated images.
            include_text: Whether to include extracted text with the images.
        """
        self.file_path = file_path
        # NOTE(review): base_width and include_text are stored here but are
        # not read by to_scenario_list in this class.
        self.base_width = base_width
        self.include_text = include_text

    @classmethod
    def example(cls) -> 'PDFImageSource':
        """Return an example PDFImageSource instance.

        The instance uses a placeholder path and its ``to_scenario_list`` is
        swapped, on this instance only, for a function returning two fixed
        sample page entries.
        """
        import types

        mock_instance = cls(
            file_path="/path/to/nonexistent/file.pdf",
            base_width=2000,
            include_text=True
        )

        def mock_to_scenario_list(self):
            from .scenario_list import ScenarioList
            # Two hard-coded pages standing in for real conversion output.
            return ScenarioList([
                Scenario({"filepath": "/tmp/page_1.jpeg", "page": 0, "text": "This is page 1 content"}),
                Scenario({"filepath": "/tmp/page_2.jpeg", "page": 1, "text": "This is page 2 content"})
            ])

        # Bind the replacement to this one instance only.
        mock_instance.to_scenario_list = types.MethodType(mock_to_scenario_list, mock_instance)

        return mock_instance

    def to_scenario_list(self):
        """Create a ScenarioList from a PDF file, converting pages to images.

        After checking that ``pdf2image`` can be imported, the conversion is
        delegated to ``PdfTools.from_pdf_to_image`` with JPEG output.

        Returns:
            A ScenarioList built from what ``PdfTools.from_pdf_to_image``
            returns.

        Raises:
            ScenarioError: For any failure, including a missing ``pdf2image``
                installation (the ImportError is wrapped as well).
        """
        from .scenario_list import ScenarioList
        from .scenario_list_pdf_tools import PdfTools

        try:
            # The imported name is unused; this only verifies pdf2image is
            # available before conversion starts.
            try:
                from pdf2image import convert_from_path
            except ImportError:
                raise ImportError("pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'.")

            page_scenarios = PdfTools.from_pdf_to_image(self.file_path, image_format="jpeg")
            return ScenarioList(page_scenarios)

        except Exception as e:
            from .exceptions import ScenarioError
            raise ScenarioError(f"Error converting PDF to images: {str(e)}")
|
1633
|
+
|
1634
|
+
|
1635
|
+
class ScenarioSource:
|
1636
|
+
"""
|
1637
|
+
Factory class for creating ScenarioList objects from various sources.
|
1638
|
+
|
1639
|
+
This class provides static methods for creating ScenarioList objects from different
|
1640
|
+
data sources, centralizing the creation logic that was previously scattered across
|
1641
|
+
different classmethods in the ScenarioList class.
|
1642
|
+
|
1643
|
+
The main entry point is the from_source method, which dispatches to appropriate
|
1644
|
+
source-specific methods based on the source_type parameter.
|
1645
|
+
"""
|
1646
|
+
|
1647
|
+
@staticmethod
|
1648
|
+
def from_source(source_type: str, *args, **kwargs):
    """
    Create a ScenarioList from a specified source type.

    This method serves as the main entry point for creating ScenarioList objects,
    dispatching to the appropriate source-specific method based on the source_type.

    The registry lookup (``Source.get_source_class``) is tried first. Only when
    that lookup raises ``ValueError`` does the method fall back to a legacy
    ``ScenarioSource._from_<source_type>`` static method, if one exists.

    Args:
        source_type: The type of source to create a ScenarioList from.
            Valid values include: 'urls', 'directory', 'list', 'list_of_tuples',
            'sqlite', 'latex', 'google_doc', 'pandas', 'dta', 'wikipedia',
            'excel', 'google_sheet', 'delimited_file', 'csv', 'tsv', 'dict',
            'nested_dict', 'parquet', 'pdf', 'pdf_to_image'.
        *args: Positional arguments to pass to the source-specific method.
        **kwargs: Keyword arguments to pass to the source-specific method.

    Returns:
        A ScenarioList object created from the specified source.

    Raises:
        ValueError: If the source_type is not recognized.
    """
    # Keep the try body limited to the lookup. A ValueError raised while
    # constructing the source or building the list must propagate unchanged,
    # not be mistaken for "unknown source type".
    try:
        source_class = Source.get_source_class(source_type)
    except ValueError as registry_error:
        # For backward compatibility, try the old method if the source_type isn't in the registry
        legacy_factory = getattr(ScenarioSource, f"_from_{source_type}", None)
        if legacy_factory is None:
            raise ValueError(f"Unsupported source type: {source_type}") from registry_error
        return legacy_factory(*args, **kwargs)

    return source_class(*args, **kwargs).to_scenario_list()
|
1682
|
+
|
1683
|
+
@staticmethod
|
1684
|
+
def _from_urls(
    urls: list[str],
    field_name: Optional[str] = "text",
    timeout: Optional[float] = 30.0,
):
    """Create a ScenarioList from a list of URLs.

    Each URL is fetched with ``requests.get``. On success, a Scenario mapping
    ``field_name`` to the response text is appended to the result. A URL whose
    request raises ``requests.RequestException`` (including HTTP error
    statuses, via ``raise_for_status``) is reported with ``warnings.warn``
    and skipped.

    Args:
        urls: URLs to fetch, in order.
        field_name: Key under which each response body is stored.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that an
            unresponsive server cannot block the call indefinitely. Pass
            ``None`` to wait without limit (the behaviour before this
            parameter existed).

    Returns:
        A ScenarioList with one Scenario per successfully fetched URL.
    """
    from .scenario_list import ScenarioList

    import requests

    result = ScenarioList()
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            result.append(Scenario({field_name: response.text}))
        except requests.RequestException as e:
            # Best effort: one bad URL should not abort the whole batch.
            warnings.warn(f"Failed to fetch URL {url}: {str(e)}")

    return result
|
1702
|
+
|
1703
|
+
@staticmethod
|
1704
|
+
def _from_directory(
    directory: str,
    pattern: str = "*",
    recursive: bool = False,
    metadata: bool = True,
    ignore_dirs: List[str] = None,
    ignore_files: List[str] = None,
):
    """Deprecated: create a ScenarioList from files in a directory.

    Issues a DeprecationWarning, then forwards every argument by keyword to
    ``DirectorySource`` and returns its ``to_scenario_list()`` result.
    """
    notice = "_from_directory is deprecated. Use DirectorySource directly or ScenarioSource.from_source('directory', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    options = {
        "directory": directory,
        "pattern": pattern,
        "recursive": recursive,
        "metadata": metadata,
        "ignore_dirs": ignore_dirs,
        "ignore_files": ignore_files,
    }
    return DirectorySource(**options).to_scenario_list()
|
1727
|
+
|
1728
|
+
@staticmethod
|
1729
|
+
def _from_list(
    field_name: str, values: list, use_indexes: bool = False
):
    """Deprecated: create a ScenarioList from a list of values under one field name.

    Issues a DeprecationWarning, then delegates to
    ``ListSource(field_name, values, use_indexes).to_scenario_list()``.
    """
    notice = "_from_list is deprecated. Use ListSource directly or ScenarioSource.from_source('list', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return ListSource(field_name, values, use_indexes).to_scenario_list()
|
1740
|
+
|
1741
|
+
@staticmethod
|
1742
|
+
def _from_list_of_tuples(
    field_names: list[str], values: list[tuple], use_indexes: bool = False
):
    """Deprecated: create a ScenarioList from tuples paired with field names.

    Issues a DeprecationWarning, then delegates to
    ``TuplesSource(field_names, values, use_indexes).to_scenario_list()``.
    """
    notice = "_from_list_of_tuples is deprecated. Use TuplesSource directly or ScenarioSource.from_source('list_of_tuples', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return TuplesSource(field_names, values, use_indexes).to_scenario_list()
|
1753
|
+
|
1754
|
+
@staticmethod
|
1755
|
+
def _from_sqlite(
    db_path: str, table: str, fields: Optional[list] = None
):
    """Deprecated: create a ScenarioList from a SQLite database.

    Issues a DeprecationWarning, then delegates to
    ``SQLiteSource(db_path, table, fields).to_scenario_list()``.
    """
    notice = "_from_sqlite is deprecated. Use SQLiteSource directly or ScenarioSource.from_source('sqlite', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return SQLiteSource(db_path, table, fields).to_scenario_list()
|
1766
|
+
|
1767
|
+
@staticmethod
|
1768
|
+
def _from_latex(
    file_path: str, table_index: int = 0, has_header: bool = True
):
    """Deprecated: create a ScenarioList from a LaTeX file.

    Issues a DeprecationWarning, then delegates to
    ``LaTeXSource(file_path, table_index, has_header).to_scenario_list()``.
    """
    notice = "_from_latex is deprecated. Use LaTeXSource directly or ScenarioSource.from_source('latex', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return LaTeXSource(file_path, table_index, has_header).to_scenario_list()
|
1779
|
+
|
1780
|
+
@staticmethod
|
1781
|
+
def _from_google_doc(url: str):
    """Deprecated: create a ScenarioList from a Google Doc.

    Issues a DeprecationWarning, then delegates to
    ``GoogleDocSource(url).to_scenario_list()``.
    """
    notice = "_from_google_doc is deprecated. Use GoogleDocSource directly or ScenarioSource.from_source('google_doc', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return GoogleDocSource(url).to_scenario_list()
|
1790
|
+
|
1791
|
+
@staticmethod
|
1792
|
+
def _from_pandas(df):
    """Deprecated: create a ScenarioList from a pandas DataFrame.

    Issues a DeprecationWarning, then delegates to
    ``PandasSource(df).to_scenario_list()``.
    """
    notice = "_from_pandas is deprecated. Use PandasSource directly or ScenarioSource.from_source('pandas', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return PandasSource(df).to_scenario_list()
|
1801
|
+
|
1802
|
+
@staticmethod
|
1803
|
+
def _from_dta(file_path: str, include_metadata: bool = True):
    """Deprecated: create a ScenarioList from a Stata data file.

    Issues a DeprecationWarning, then delegates to
    ``StataSource(file_path, include_metadata).to_scenario_list()``.
    """
    notice = "_from_dta is deprecated. Use StataSource directly or ScenarioSource.from_source('dta', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return StataSource(file_path, include_metadata).to_scenario_list()
|
1812
|
+
|
1813
|
+
@staticmethod
|
1814
|
+
def _from_wikipedia(
    url: str, table_index: int = 0, header: bool = True
):
    """Deprecated: create a ScenarioList from a table on a Wikipedia page.

    Issues a DeprecationWarning, then delegates to
    ``WikipediaSource(url, table_index, header).to_scenario_list()``.
    """
    notice = "_from_wikipedia is deprecated. Use WikipediaSource directly or ScenarioSource.from_source('wikipedia', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return WikipediaSource(url, table_index, header).to_scenario_list()
|
1825
|
+
|
1826
|
+
@staticmethod
|
1827
|
+
def _from_excel(
    file_path: str, sheet_name: Optional[str] = None, **kwargs
):
    """Deprecated: create a ScenarioList from an Excel file.

    Issues a DeprecationWarning, then delegates to ``ExcelSource`` with
    ``file_path`` positionally, ``sheet_name`` by keyword and any extra
    keyword arguments passed through unchanged.
    """
    notice = "_from_excel is deprecated. Use ExcelSource directly or ScenarioSource.from_source('excel', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return ExcelSource(file_path, sheet_name=sheet_name, **kwargs).to_scenario_list()
|
1838
|
+
|
1839
|
+
@staticmethod
|
1840
|
+
def _from_google_sheet(url: str, sheet_name: Optional[str] = None, column_names: Optional[List[str]] = None, **kwargs):
    """Deprecated: create a ScenarioList from a Google Sheet.

    Issues a DeprecationWarning, then delegates to ``GoogleSheetSource`` with
    ``url`` positionally, ``sheet_name`` and ``column_names`` by keyword, and
    any extra keyword arguments passed through unchanged.
    """
    notice = "_from_google_sheet is deprecated. Use GoogleSheetSource directly or ScenarioSource.from_source('google_sheet', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    sheet_source = GoogleSheetSource(
        url,
        sheet_name=sheet_name,
        column_names=column_names,
        **kwargs,
    )
    return sheet_source.to_scenario_list()
|
1849
|
+
|
1850
|
+
@staticmethod
|
1851
|
+
def _from_delimited_file(
    file_or_url: str,
    delimiter: str = ",",
    has_header: bool = True,
    encoding: str = "utf-8",
    **kwargs,
):
    """Deprecated: create a ScenarioList from a delimited file or URL.

    Issues a DeprecationWarning, then forwards the named arguments by keyword
    (plus any extra keyword arguments) to ``DelimitedFileSource`` and returns
    its ``to_scenario_list()`` result.
    """
    notice = "_from_delimited_file is deprecated. Use DelimitedFileSource directly or ScenarioSource.from_source('delimited_file', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return DelimitedFileSource(
        file_or_url=file_or_url,
        delimiter=delimiter,
        has_header=has_header,
        encoding=encoding,
        **kwargs,
    ).to_scenario_list()
|
1872
|
+
|
1873
|
+
@staticmethod
|
1874
|
+
def _from_csv(file_or_url: str, **kwargs):
    """Deprecated: create a ScenarioList from a CSV file or URL.

    Issues a DeprecationWarning, then delegates to
    ``CSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()``.
    """
    notice = "_from_csv is deprecated. Use CSVSource directly or ScenarioSource.from_source('csv', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return CSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()
|
1883
|
+
|
1884
|
+
@staticmethod
|
1885
|
+
def _from_tsv(file_or_url: str, **kwargs):
    """Deprecated: create a ScenarioList from a TSV file or URL.

    Issues a DeprecationWarning, then delegates to
    ``TSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()``.
    """
    notice = "_from_tsv is deprecated. Use TSVSource directly or ScenarioSource.from_source('tsv', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return TSVSource(file_or_url=file_or_url, **kwargs).to_scenario_list()
|
1894
|
+
|
1895
|
+
@staticmethod
|
1896
|
+
def _from_dict(data: dict):
    """Create a ScenarioList from a dictionary.

    Two input shapes are accepted:

    * A dict with a ``"scenarios"`` key: each item of ``data["scenarios"]``
      is wrapped in a Scenario, and ``data.get("codebook", {})`` is passed
      as the second argument to ScenarioList.
    * Otherwise, a dict mapping field names to equal-length lists: row ``i``
      of the result takes the ``i``-th element of every list.

    An empty dict yields an empty ScenarioList.

    Args:
        data: The dictionary to convert.

    Returns:
        The resulting ScenarioList.

    Raises:
        ScenarioError: If any value is not a list, or the lists differ in
            length (column-oriented shape only).
    """
    from .scenario_list import ScenarioList
    # Imported locally, as the other blocks in this module do, so the name is
    # guaranteed to be in scope when an error has to be raised.
    from .exceptions import ScenarioError

    if "scenarios" in data:
        scenarios = [Scenario(s) for s in data["scenarios"]]
        codebook = data.get("codebook", {})
        return ScenarioList(scenarios, codebook)

    # Assume the dict maps field names to lists of values
    field_names = list(data.keys())
    columns = list(data.values())
    if not all(isinstance(column, list) for column in columns):
        raise ScenarioError("All values in the dictionary must be lists")

    # Check all lists have the same length
    if len({len(column) for column in columns}) > 1:
        raise ScenarioError("All lists must have the same length")

    # zip(*columns) yields one row per index. With no columns at all it
    # yields nothing, so an empty dict gives an empty list instead of
    # failing on a missing first length.
    scenarios = [
        Scenario(dict(zip(field_names, row))) for row in zip(*columns)
    ]
    return ScenarioList(scenarios)
|
1922
|
+
|
1923
|
+
@staticmethod
|
1924
|
+
def _from_nested_dict(data: dict, id_field: Optional[str] = None):
    """Create a ScenarioList from a nested dictionary.

    Every value of ``data`` must itself be a dict; each one becomes a
    Scenario built from a shallow copy of that inner dict. When ``id_field``
    is truthy, the outer key is stored in the copy under ``id_field``
    (overwriting any existing entry with that name).

    Args:
        data: Mapping of outer keys to inner dictionaries.
        id_field: Optional name of the field that receives the outer key.

    Returns:
        A ScenarioList with one Scenario per entry of ``data``, in iteration
        order.

    Raises:
        ScenarioError: If any value of ``data`` is not a dict.
    """
    from .scenario_list import ScenarioList
    # Imported locally, as the other blocks in this module do, so the name is
    # guaranteed to be in scope when an error has to be raised.
    from .exceptions import ScenarioError

    scenarios = []

    for key, value in data.items():
        if not isinstance(value, dict):
            raise ScenarioError(f"Value for key {key} is not a dictionary")

        # Copy so the caller's inner dict is not mutated by the id insertion.
        scenario_dict = value.copy()
        if id_field:
            scenario_dict[id_field] = key
        scenarios.append(Scenario(scenario_dict))

    return ScenarioList(scenarios)
|
1940
|
+
|
1941
|
+
@staticmethod
|
1942
|
+
def _from_parquet(file_path: str):
    """Deprecated: create a ScenarioList from a Parquet file.

    Issues a DeprecationWarning, then delegates to
    ``ParquetSource(file_path).to_scenario_list()``.
    """
    notice = "_from_parquet is deprecated. Use ParquetSource directly or ScenarioSource.from_source('parquet', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return ParquetSource(file_path).to_scenario_list()
|
1951
|
+
|
1952
|
+
@staticmethod
|
1953
|
+
def _from_pdf(
    file_path: str,
    chunk_type: Literal["page", "text"] = "page",
    chunk_size: int = 1,
    chunk_overlap: int = 0,
):
    """Deprecated: create a ScenarioList from a PDF file.

    Issues a DeprecationWarning, then forwards all four arguments by keyword
    to ``PDFSource`` and returns its ``to_scenario_list()`` result.
    """
    notice = "_from_pdf is deprecated. Use PDFSource directly or ScenarioSource.from_source('pdf', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    options = {
        "file_path": file_path,
        "chunk_type": chunk_type,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return PDFSource(**options).to_scenario_list()
|
1972
|
+
|
1973
|
+
@staticmethod
|
1974
|
+
def _from_pdf_to_image(
    file_path: str,
    base_width: int = 2000,
    include_text: bool = True,
):
    """Deprecated: create a ScenarioList of images extracted from a PDF file.

    Issues a DeprecationWarning, then forwards all three arguments by keyword
    to ``PDFImageSource`` and returns its ``to_scenario_list()`` result.
    """
    notice = "_from_pdf_to_image is deprecated. Use PDFImageSource directly or ScenarioSource.from_source('pdf_to_image', ...) instead."
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    options = {
        "file_path": file_path,
        "base_width": base_width,
        "include_text": include_text,
    }
    return PDFImageSource(**options).to_scenario_list()
|