edsl 0.1.60__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +65 -17
- edsl/agents/agent_list.py +117 -33
- edsl/base/base_class.py +80 -11
- edsl/config/config_class.py +7 -2
- edsl/coop/coop.py +1295 -85
- edsl/coop/coop_prolific_filters.py +171 -0
- edsl/dataset/display/table_display.py +40 -7
- edsl/db_list/sqlite_list.py +102 -3
- edsl/jobs/data_structures.py +46 -31
- edsl/jobs/jobs.py +73 -2
- edsl/jobs/remote_inference.py +49 -15
- edsl/questions/loop_processor.py +289 -10
- edsl/questions/templates/dict/answering_instructions.jinja +0 -1
- edsl/scenarios/scenario_list.py +31 -1
- edsl/scenarios/scenario_source.py +606 -498
- edsl/surveys/survey.py +198 -163
- {edsl-0.1.60.dist-info → edsl-0.1.61.dist-info}/METADATA +3 -3
- {edsl-0.1.60.dist-info → edsl-0.1.61.dist-info}/RECORD +22 -21
- {edsl-0.1.60.dist-info → edsl-0.1.61.dist-info}/LICENSE +0 -0
- {edsl-0.1.60.dist-info → edsl-0.1.61.dist-info}/WHEEL +0 -0
- {edsl-0.1.60.dist-info → edsl-0.1.61.dist-info}/entry_points.txt +0 -0
@@ -16,32 +16,53 @@ from __future__ import annotations
|
|
16
16
|
import functools
|
17
17
|
import warnings
|
18
18
|
import fnmatch
|
19
|
-
from
|
19
|
+
from collections import defaultdict
|
20
|
+
import warnings
|
21
|
+
from typing import (
|
22
|
+
Any,
|
23
|
+
Callable,
|
24
|
+
List,
|
25
|
+
Literal,
|
26
|
+
Optional,
|
27
|
+
Type,
|
28
|
+
TypeVar,
|
29
|
+
Union,
|
30
|
+
TYPE_CHECKING,
|
31
|
+
cast,
|
32
|
+
Any,
|
33
|
+
)
|
34
|
+
|
35
|
+
T = TypeVar("T")
|
20
36
|
|
21
|
-
T = TypeVar('T')
|
22
37
|
|
23
|
-
def deprecated_classmethod(
|
38
|
+
def deprecated_classmethod(
|
39
|
+
alternative: str,
|
40
|
+
) -> Callable[[Callable[..., T]], Callable[..., T]]:
|
24
41
|
"""
|
25
42
|
Decorator that marks a class method as deprecated.
|
26
|
-
|
43
|
+
|
27
44
|
Args:
|
28
45
|
alternative: The suggested alternative to use instead
|
29
|
-
|
46
|
+
|
30
47
|
Returns:
|
31
48
|
A decorator function that wraps the original method with a deprecation warning
|
32
49
|
"""
|
50
|
+
|
33
51
|
def decorator(func: Callable[..., T]) -> Callable[..., T]:
|
34
52
|
@functools.wraps(func)
|
35
53
|
def wrapper(*args: Any, **kwargs: Any) -> T:
|
36
54
|
warnings.warn(
|
37
55
|
f"{func.__qualname__} is deprecated. Use {alternative} instead.",
|
38
56
|
DeprecationWarning,
|
39
|
-
stacklevel=2
|
57
|
+
stacklevel=2,
|
40
58
|
)
|
41
59
|
return func(*args, **kwargs)
|
60
|
+
|
42
61
|
return wrapper
|
62
|
+
|
43
63
|
return decorator
|
44
64
|
|
65
|
+
|
45
66
|
import os
|
46
67
|
import csv
|
47
68
|
import json
|
@@ -61,27 +82,28 @@ from .exceptions import ScenarioError
|
|
61
82
|
|
62
83
|
from abc import ABC, abstractmethod
|
63
84
|
|
85
|
+
|
64
86
|
class Source(ABC):
|
65
87
|
# Registry to store child classes and their source types
|
66
|
-
_registry: dict[str, Type[
|
88
|
+
_registry: dict[str, Type["Source"]] = {}
|
67
89
|
|
68
90
|
def __init_subclass__(cls, **kwargs):
|
69
91
|
"""Automatically register subclasses with their source_type."""
|
70
92
|
super().__init_subclass__(**kwargs)
|
71
|
-
if hasattr(cls,
|
93
|
+
if hasattr(cls, "source_type"):
|
72
94
|
Source._registry[cls.source_type] = cls
|
73
95
|
|
74
96
|
@classmethod
|
75
97
|
@abstractmethod
|
76
|
-
def example(cls) ->
|
98
|
+
def example(cls) -> "Source":
|
77
99
|
"""
|
78
100
|
Return an example instance of this Source type.
|
79
|
-
|
101
|
+
|
80
102
|
This method should return a valid instance of the Source subclass
|
81
103
|
that can be used for testing. The instance should be created with
|
82
104
|
reasonable default values that will produce a valid ScenarioList
|
83
105
|
when to_scenario_list() is called.
|
84
|
-
|
106
|
+
|
85
107
|
Returns:
|
86
108
|
An instance of the Source subclass
|
87
109
|
"""
|
@@ -91,14 +113,14 @@ class Source(ABC):
|
|
91
113
|
def to_scenario_list(self):
|
92
114
|
"""
|
93
115
|
Convert the source to a ScenarioList.
|
94
|
-
|
116
|
+
|
95
117
|
Returns:
|
96
118
|
A ScenarioList containing the data from this source
|
97
119
|
"""
|
98
120
|
pass
|
99
121
|
|
100
122
|
@classmethod
|
101
|
-
def get_source_class(cls, source_type: str) -> Type[
|
123
|
+
def get_source_class(cls, source_type: str) -> Type["Source"]:
|
102
124
|
"""Get the Source subclass for a given source_type."""
|
103
125
|
if source_type not in cls._registry:
|
104
126
|
raise ValueError(f"No Source subclass found for source_type: {source_type}")
|
@@ -114,12 +136,12 @@ class Source(ABC):
|
|
114
136
|
"""
|
115
137
|
Test all registered source types by creating an example instance
|
116
138
|
and calling to_scenario_list() on it.
|
117
|
-
|
139
|
+
|
118
140
|
Returns:
|
119
141
|
A dictionary mapping source types to boolean success values
|
120
142
|
"""
|
121
143
|
from .scenario_list import ScenarioList
|
122
|
-
|
144
|
+
|
123
145
|
results = {}
|
124
146
|
for source_type, source_class in cls._registry.items():
|
125
147
|
try:
|
@@ -130,7 +152,9 @@ class Source(ABC):
|
|
130
152
|
# Basic validation
|
131
153
|
if not isinstance(scenario_list, ScenarioList):
|
132
154
|
results[source_type] = False
|
133
|
-
print(
|
155
|
+
print(
|
156
|
+
f"Source {source_type} returned {type(scenario_list)} instead of ScenarioList"
|
157
|
+
)
|
134
158
|
else:
|
135
159
|
results[source_type] = True
|
136
160
|
except Exception as e:
|
@@ -138,6 +162,7 @@ class Source(ABC):
|
|
138
162
|
print(f"Source {source_type} exception: {e}")
|
139
163
|
return results
|
140
164
|
|
165
|
+
|
141
166
|
class URLSource(Source):
|
142
167
|
source_type = "urls"
|
143
168
|
|
@@ -146,19 +171,16 @@ class URLSource(Source):
|
|
146
171
|
self.field_name = field_name
|
147
172
|
|
148
173
|
@classmethod
|
149
|
-
def example(cls) ->
|
174
|
+
def example(cls) -> "URLSource":
|
150
175
|
"""Return an example URLSource instance."""
|
151
|
-
return cls(
|
152
|
-
|
153
|
-
field_name="text"
|
154
|
-
)
|
155
|
-
|
176
|
+
return cls(urls=["http://www.example.com"], field_name="text")
|
177
|
+
|
156
178
|
def to_scenario_list(self):
|
157
179
|
"""Create a ScenarioList from a list of URLs."""
|
158
180
|
import requests
|
159
|
-
|
181
|
+
|
160
182
|
from .scenario_list import ScenarioList
|
161
|
-
|
183
|
+
|
162
184
|
result = ScenarioList()
|
163
185
|
for url in self.urls:
|
164
186
|
try:
|
@@ -169,9 +191,9 @@ class URLSource(Source):
|
|
169
191
|
except requests.RequestException as e:
|
170
192
|
warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
|
171
193
|
continue
|
172
|
-
|
194
|
+
|
173
195
|
return result
|
174
|
-
|
196
|
+
|
175
197
|
|
176
198
|
class ListSource(Source):
|
177
199
|
source_type = "list"
|
@@ -182,26 +204,26 @@ class ListSource(Source):
|
|
182
204
|
self.use_indexes = use_indexes
|
183
205
|
|
184
206
|
@classmethod
|
185
|
-
def example(cls) ->
|
207
|
+
def example(cls) -> "ListSource":
|
186
208
|
"""Return an example ListSource instance."""
|
187
209
|
return cls(
|
188
210
|
field_name="text",
|
189
211
|
values=["example1", "example2", "example3"],
|
190
|
-
use_indexes=True
|
212
|
+
use_indexes=True,
|
191
213
|
)
|
192
214
|
|
193
215
|
def to_scenario_list(self):
|
194
216
|
"""Create a ScenarioList from a list of values with a specified field name."""
|
195
217
|
from .scenario_list import ScenarioList
|
196
|
-
|
218
|
+
|
197
219
|
scenarios = []
|
198
|
-
|
220
|
+
|
199
221
|
for i, value in enumerate(self.values):
|
200
222
|
scenario_dict = {self.field_name: value}
|
201
223
|
if self.use_indexes:
|
202
224
|
scenario_dict["idx"] = i
|
203
225
|
scenarios.append(Scenario(scenario_dict))
|
204
|
-
|
226
|
+
|
205
227
|
return ScenarioList(scenarios)
|
206
228
|
|
207
229
|
|
@@ -225,48 +247,48 @@ class DirectorySource(Source):
|
|
225
247
|
self.ignore_files = ignore_files or []
|
226
248
|
|
227
249
|
@classmethod
|
228
|
-
def example(cls) ->
|
250
|
+
def example(cls) -> "DirectorySource":
|
229
251
|
"""Return an example DirectorySource instance."""
|
230
252
|
import tempfile
|
231
253
|
import os
|
232
|
-
|
254
|
+
|
233
255
|
# Create a temporary directory for the example
|
234
256
|
temp_dir = tempfile.mkdtemp(prefix="edsl_test_")
|
235
|
-
|
257
|
+
|
236
258
|
# Create some sample files in the directory
|
237
259
|
with open(os.path.join(temp_dir, "test1.txt"), "w") as f:
|
238
260
|
f.write("Sample content 1")
|
239
|
-
|
261
|
+
|
240
262
|
with open(os.path.join(temp_dir, "test2.txt"), "w") as f:
|
241
263
|
f.write("Sample content 2")
|
242
|
-
|
264
|
+
|
243
265
|
# Create a subdirectory with a file
|
244
266
|
subdir = os.path.join(temp_dir, "subdir")
|
245
267
|
os.makedirs(subdir, exist_ok=True)
|
246
268
|
with open(os.path.join(subdir, "test3.txt"), "w") as f:
|
247
269
|
f.write("Sample content 3")
|
248
|
-
|
270
|
+
|
249
271
|
return cls(
|
250
272
|
directory=temp_dir,
|
251
273
|
pattern="*.txt",
|
252
274
|
recursive=True,
|
253
275
|
metadata=True,
|
254
276
|
ignore_dirs=["__pycache__"],
|
255
|
-
ignore_files=["*.pyc"]
|
277
|
+
ignore_files=["*.pyc"],
|
256
278
|
)
|
257
|
-
|
279
|
+
|
258
280
|
def to_scenario_list(self):
|
259
281
|
"""Create a ScenarioList from files in a directory."""
|
260
282
|
import os
|
261
283
|
import glob
|
262
|
-
|
284
|
+
|
263
285
|
from .scenario_list import ScenarioList
|
264
|
-
|
286
|
+
|
265
287
|
# Set default recursive value
|
266
288
|
recursive = self.recursive
|
267
|
-
|
289
|
+
|
268
290
|
# Handle paths with wildcards properly
|
269
|
-
if
|
291
|
+
if "*" in self.directory:
|
270
292
|
# Handle "**/*.py" patterns (recursive wildcard)
|
271
293
|
if "**" in self.directory:
|
272
294
|
parts = self.directory.split("**")
|
@@ -287,52 +309,58 @@ class DirectorySource(Source):
|
|
287
309
|
else:
|
288
310
|
directory = self.directory
|
289
311
|
pattern = self.pattern
|
290
|
-
|
312
|
+
|
291
313
|
# Check if directory exists
|
292
314
|
if not os.path.isdir(directory):
|
293
315
|
from .exceptions import FileNotFoundScenarioError
|
316
|
+
|
294
317
|
raise FileNotFoundScenarioError(f"Directory not found: {directory}")
|
295
|
-
|
318
|
+
|
296
319
|
# Use glob directly for ** patterns to prevent duplicates
|
297
320
|
if "**" in pattern:
|
298
321
|
from .scenario_list import ScenarioList
|
299
322
|
from .file_store import FileStore
|
300
|
-
|
323
|
+
|
301
324
|
# Handle the pattern directly with glob
|
302
325
|
full_pattern = os.path.join(directory, pattern)
|
303
326
|
file_paths = glob.glob(full_pattern, recursive=True)
|
304
|
-
|
327
|
+
|
305
328
|
# Remove duplicates (by converting to a set and back)
|
306
329
|
file_paths = list(set(file_paths))
|
307
|
-
|
330
|
+
|
308
331
|
# Create scenarios
|
309
332
|
scenarios = []
|
310
333
|
for file_path in file_paths:
|
311
334
|
if os.path.isfile(file_path):
|
312
335
|
# Check if file should be ignored
|
313
336
|
file_name = os.path.basename(file_path)
|
314
|
-
if any(
|
337
|
+
if any(
|
338
|
+
fnmatch.fnmatch(file_name, ignore_pattern)
|
339
|
+
for ignore_pattern in self.ignore_files or []
|
340
|
+
):
|
315
341
|
continue
|
316
|
-
|
342
|
+
|
317
343
|
# Create FileStore object
|
318
344
|
file_store = FileStore(file_path)
|
319
|
-
|
345
|
+
|
320
346
|
# Create scenario
|
321
347
|
scenario_data = {"file": file_store}
|
322
|
-
|
348
|
+
|
323
349
|
# Add metadata if requested
|
324
350
|
if self.metadata:
|
325
351
|
file_stat = os.stat(file_path)
|
326
|
-
scenario_data.update(
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
352
|
+
scenario_data.update(
|
353
|
+
{
|
354
|
+
"file_path": file_path,
|
355
|
+
"file_name": file_name,
|
356
|
+
"file_size": file_stat.st_size,
|
357
|
+
"file_created": file_stat.st_ctime,
|
358
|
+
"file_modified": file_stat.st_mtime,
|
359
|
+
}
|
360
|
+
)
|
361
|
+
|
334
362
|
scenarios.append(Scenario(scenario_data))
|
335
|
-
|
363
|
+
|
336
364
|
return ScenarioList(scenarios)
|
337
365
|
else:
|
338
366
|
# Use the standard scanning method for non-** patterns
|
@@ -348,148 +376,146 @@ class DirectorySource(Source):
|
|
348
376
|
|
349
377
|
class TuplesSource(Source):
|
350
378
|
source_type = "list_of_tuples"
|
351
|
-
|
352
|
-
def __init__(
|
379
|
+
|
380
|
+
def __init__(
|
381
|
+
self, field_names: list[str], values: list[tuple], use_indexes: bool = False
|
382
|
+
):
|
353
383
|
self.field_names = field_names
|
354
384
|
self.values = values
|
355
385
|
self.use_indexes = use_indexes
|
356
|
-
|
386
|
+
|
357
387
|
# Validate inputs
|
358
388
|
if not all(isinstance(v, (tuple, list)) for v in values):
|
359
389
|
raise ScenarioError("All values must be tuples or lists")
|
360
|
-
|
390
|
+
|
361
391
|
@classmethod
|
362
|
-
def example(cls) ->
|
392
|
+
def example(cls) -> "TuplesSource":
|
363
393
|
"""Return an example TuplesSource instance."""
|
364
394
|
return cls(
|
365
395
|
field_names=["name", "age", "city"],
|
366
396
|
values=[
|
367
397
|
("Alice", 30, "New York"),
|
368
398
|
("Bob", 25, "San Francisco"),
|
369
|
-
("Charlie", 35, "Boston")
|
399
|
+
("Charlie", 35, "Boston"),
|
370
400
|
],
|
371
|
-
use_indexes=True
|
401
|
+
use_indexes=True,
|
372
402
|
)
|
373
|
-
|
403
|
+
|
374
404
|
def to_scenario_list(self):
|
375
405
|
"""Create a ScenarioList from a list of tuples with specified field names."""
|
376
406
|
from .scenario_list import ScenarioList
|
377
|
-
|
407
|
+
|
378
408
|
scenarios = []
|
379
|
-
|
409
|
+
|
380
410
|
for i, value_tuple in enumerate(self.values):
|
381
411
|
if len(value_tuple) != len(self.field_names):
|
382
412
|
raise ScenarioError(
|
383
413
|
f"Tuple {i} has {len(value_tuple)} elements, but {len(self.field_names)} field names were provided."
|
384
414
|
)
|
385
|
-
|
415
|
+
|
386
416
|
scenario_dict = dict(zip(self.field_names, value_tuple))
|
387
417
|
if self.use_indexes:
|
388
418
|
scenario_dict["idx"] = i
|
389
419
|
scenarios.append(Scenario(scenario_dict))
|
390
|
-
|
420
|
+
|
391
421
|
return ScenarioList(scenarios)
|
392
422
|
|
393
423
|
|
394
424
|
class SQLiteSource(Source):
|
395
425
|
source_type = "sqlite"
|
396
|
-
|
426
|
+
|
397
427
|
def __init__(self, db_path: str, table: str, fields: Optional[list] = None):
|
398
428
|
self.db_path = db_path
|
399
429
|
self.table = table
|
400
430
|
self.fields = fields
|
401
|
-
|
431
|
+
|
402
432
|
@classmethod
|
403
|
-
def example(cls) ->
|
433
|
+
def example(cls) -> "SQLiteSource":
|
404
434
|
"""Return an example SQLiteSource instance."""
|
405
435
|
import sqlite3
|
406
436
|
import tempfile
|
407
437
|
import os
|
408
|
-
|
438
|
+
|
409
439
|
# Create a temporary SQLite database for the example
|
410
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
440
|
+
fd, temp_path = tempfile.mkstemp(suffix=".db", prefix="edsl_test_")
|
411
441
|
os.close(fd) # Close the file descriptor
|
412
|
-
|
442
|
+
|
413
443
|
# Connect to the database and create a sample table
|
414
444
|
conn = sqlite3.connect(temp_path)
|
415
445
|
cursor = conn.cursor()
|
416
|
-
|
446
|
+
|
417
447
|
# Create a simple table
|
418
|
-
cursor.execute(
|
419
|
-
|
448
|
+
cursor.execute(
|
449
|
+
"CREATE TABLE test_table (id INTEGER PRIMARY KEY, name TEXT, value INTEGER)"
|
450
|
+
)
|
451
|
+
|
420
452
|
# Insert sample data
|
421
|
-
sample_data = [
|
422
|
-
|
423
|
-
|
424
|
-
(3, 'Gamma', 300)
|
425
|
-
]
|
426
|
-
cursor.executemany('INSERT INTO test_table VALUES (?, ?, ?)', sample_data)
|
427
|
-
|
453
|
+
sample_data = [(1, "Alpha", 100), (2, "Beta", 200), (3, "Gamma", 300)]
|
454
|
+
cursor.executemany("INSERT INTO test_table VALUES (?, ?, ?)", sample_data)
|
455
|
+
|
428
456
|
conn.commit()
|
429
457
|
conn.close()
|
430
|
-
|
458
|
+
|
431
459
|
return cls(
|
432
|
-
db_path=temp_path,
|
433
|
-
table='test_table',
|
434
|
-
fields=['id', 'name', 'value']
|
460
|
+
db_path=temp_path, table="test_table", fields=["id", "name", "value"]
|
435
461
|
)
|
436
|
-
|
462
|
+
|
437
463
|
def to_scenario_list(self):
|
438
464
|
"""Create a ScenarioList from a SQLite database."""
|
439
465
|
from .scenario_list import ScenarioList
|
440
466
|
import sqlite3
|
441
|
-
|
467
|
+
|
442
468
|
conn = sqlite3.connect(self.db_path)
|
443
469
|
cursor = conn.cursor()
|
444
|
-
|
470
|
+
|
445
471
|
# If fields weren't provided, get all fields from the table
|
446
472
|
fields = self.fields
|
447
473
|
if fields is None:
|
448
474
|
cursor.execute(f"PRAGMA table_info({self.table})")
|
449
475
|
fields = [row[1] for row in cursor.fetchall()]
|
450
|
-
|
476
|
+
|
451
477
|
# Query the data
|
452
478
|
field_placeholders = ", ".join(fields)
|
453
479
|
cursor.execute(f"SELECT {field_placeholders} FROM {self.table}")
|
454
480
|
rows = cursor.fetchall()
|
455
|
-
|
481
|
+
|
456
482
|
# Create scenarios
|
457
483
|
scenarios = []
|
458
484
|
for row in rows:
|
459
485
|
scenario_dict = dict(zip(fields, row))
|
460
486
|
scenarios.append(Scenario(scenario_dict))
|
461
|
-
|
487
|
+
|
462
488
|
conn.close()
|
463
489
|
return ScenarioList(scenarios)
|
464
490
|
|
465
491
|
|
466
492
|
class LaTeXSource(Source):
|
467
493
|
source_type = "latex"
|
468
|
-
|
494
|
+
|
469
495
|
def __init__(self, file_path: str, table_index: int = 0, has_header: bool = True):
|
470
496
|
"""
|
471
497
|
Initialize a LaTeXSource with a LaTeX file path.
|
472
|
-
|
498
|
+
|
473
499
|
Args:
|
474
500
|
file_path: The path to the LaTeX file.
|
475
|
-
table_index: The index of the table to extract (if multiple tables exist).
|
501
|
+
table_index: The index of the table to extract (if multiple tables exist).
|
476
502
|
Default is 0 (first table).
|
477
503
|
has_header: Whether the table has a header row. Default is True.
|
478
504
|
"""
|
479
505
|
self.file_path = file_path
|
480
506
|
self.table_index = table_index
|
481
507
|
self.has_header = has_header
|
482
|
-
|
508
|
+
|
483
509
|
@classmethod
|
484
|
-
def example(cls) ->
|
510
|
+
def example(cls) -> "LaTeXSource":
|
485
511
|
"""Return an example LaTeXSource instance."""
|
486
512
|
import tempfile
|
487
513
|
import os
|
488
|
-
|
514
|
+
|
489
515
|
# Create a temporary LaTeX file with a sample table
|
490
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
516
|
+
fd, temp_path = tempfile.mkstemp(suffix=".tex", prefix="edsl_test_")
|
491
517
|
os.close(fd) # Close the file descriptor
|
492
|
-
|
518
|
+
|
493
519
|
# Write a sample LaTeX table to the file
|
494
520
|
sample_latex = r"""
|
495
521
|
\documentclass{article}
|
@@ -505,39 +531,35 @@ Charlie & 35 & 92 \\
|
|
505
531
|
|
506
532
|
\end{document}
|
507
533
|
"""
|
508
|
-
with open(temp_path,
|
534
|
+
with open(temp_path, "w") as f:
|
509
535
|
f.write(sample_latex)
|
510
|
-
|
511
|
-
return cls(
|
512
|
-
|
513
|
-
table_index=0,
|
514
|
-
has_header=True
|
515
|
-
)
|
516
|
-
|
536
|
+
|
537
|
+
return cls(file_path=temp_path, table_index=0, has_header=True)
|
538
|
+
|
517
539
|
def to_scenario_list(self):
|
518
540
|
"""Create a ScenarioList from a LaTeX file."""
|
519
541
|
from .scenario_list import ScenarioList
|
520
542
|
import re
|
521
|
-
|
543
|
+
|
522
544
|
with open(self.file_path, "r") as f:
|
523
545
|
content = f.read()
|
524
|
-
|
546
|
+
|
525
547
|
# Find all tabular environments
|
526
548
|
tabular_pattern = r"\\begin{tabular}(.*?)\\end{tabular}"
|
527
549
|
tables = re.findall(tabular_pattern, content, re.DOTALL)
|
528
|
-
|
550
|
+
|
529
551
|
if not tables or self.table_index >= len(tables):
|
530
552
|
raise ScenarioError(f"No table found at index {self.table_index}")
|
531
|
-
|
553
|
+
|
532
554
|
table_content = tables[self.table_index]
|
533
|
-
|
555
|
+
|
534
556
|
# Extract rows
|
535
557
|
rows = table_content.split("\\\\")
|
536
558
|
rows = [row.strip() for row in rows if row.strip()]
|
537
|
-
|
559
|
+
|
538
560
|
if not rows:
|
539
561
|
return ScenarioList()
|
540
|
-
|
562
|
+
|
541
563
|
# Process header if available
|
542
564
|
if self.has_header:
|
543
565
|
header_row = rows[0]
|
@@ -545,98 +567,104 @@ Charlie & 35 & 92 \\
|
|
545
567
|
if not header_cells:
|
546
568
|
header_cells = header_row.split("&")
|
547
569
|
header_cells = [h.strip() for h in header_cells]
|
548
|
-
|
570
|
+
|
549
571
|
data_rows = rows[1:]
|
550
572
|
else:
|
551
573
|
# Auto-generate column names
|
552
574
|
header_cells = [f"col{i}" for i in range(rows[0].count("&") + 1)]
|
553
575
|
data_rows = rows
|
554
|
-
|
576
|
+
|
555
577
|
# Process data rows
|
556
578
|
scenarios = []
|
557
579
|
for row in data_rows:
|
558
580
|
cells = row.split("&")
|
559
581
|
cells = [cell.strip() for cell in cells]
|
560
|
-
|
582
|
+
|
561
583
|
if len(cells) != len(header_cells):
|
562
584
|
continue # Skip malformed rows
|
563
|
-
|
585
|
+
|
564
586
|
scenario_dict = dict(zip(header_cells, cells))
|
565
587
|
scenarios.append(Scenario(scenario_dict))
|
566
|
-
|
588
|
+
|
567
589
|
return ScenarioList(scenarios)
|
568
590
|
|
569
591
|
|
570
592
|
class GoogleDocSource(Source):
|
571
593
|
source_type = "google_doc"
|
572
|
-
|
594
|
+
|
573
595
|
def __init__(self, url: str):
|
574
596
|
"""
|
575
597
|
Initialize a GoogleDocSource with a Google Doc URL.
|
576
|
-
|
598
|
+
|
577
599
|
Args:
|
578
600
|
url: The URL to the Google Doc.
|
579
601
|
"""
|
580
602
|
self.url = url
|
581
|
-
|
603
|
+
|
582
604
|
@classmethod
|
583
|
-
def example(cls) ->
|
605
|
+
def example(cls) -> "GoogleDocSource":
|
584
606
|
"""Return an example GoogleDocSource instance."""
|
585
607
|
# Create a mock instance that doesn't actually fetch a Google Doc
|
586
|
-
instance = cls(
|
587
|
-
|
608
|
+
instance = cls(
|
609
|
+
url="https://docs.google.com/document/d/1234567890abcdefghijklmnopqrstuvwxyz/edit"
|
610
|
+
)
|
611
|
+
|
588
612
|
# Override the to_scenario_list method just for the example
|
589
613
|
def mock_to_scenario_list(self):
|
590
614
|
from .scenario_list import ScenarioList
|
615
|
+
|
591
616
|
# Create a simple mock ScenarioList with a few paragraphs
|
592
617
|
scenarios = [
|
593
618
|
Scenario({"text": "This is paragraph 1 from a sample Google Doc."}),
|
594
619
|
Scenario({"text": "This is paragraph 2 with some more content."}),
|
595
|
-
Scenario({"text": "This is the final paragraph with a conclusion."})
|
620
|
+
Scenario({"text": "This is the final paragraph with a conclusion."}),
|
596
621
|
]
|
597
622
|
return ScenarioList(scenarios)
|
598
|
-
|
623
|
+
|
599
624
|
# Replace the method on this instance only
|
600
625
|
import types
|
626
|
+
|
601
627
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
602
|
-
|
628
|
+
|
603
629
|
return instance
|
604
|
-
|
630
|
+
|
605
631
|
def to_scenario_list(self):
|
606
632
|
"""Create a ScenarioList from a Google Doc."""
|
607
633
|
from .scenario_list import ScenarioList
|
608
634
|
import tempfile
|
609
635
|
import requests
|
610
|
-
|
636
|
+
|
611
637
|
# Extract the document ID from the URL
|
612
638
|
if "/edit" in self.url:
|
613
639
|
doc_id = self.url.split("/d/")[1].split("/edit")[0]
|
614
640
|
else:
|
615
641
|
raise ScenarioError("Invalid Google Doc URL format.")
|
616
|
-
|
642
|
+
|
617
643
|
# Create the export URL to download as DOCX
|
618
644
|
export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
|
619
|
-
|
645
|
+
|
620
646
|
try:
|
621
647
|
# Download the Google Doc as a Word file (.docx)
|
622
648
|
response = requests.get(export_url)
|
623
649
|
response.raise_for_status() # Ensure the request was successful
|
624
|
-
|
650
|
+
|
625
651
|
# Save the Word file to a temporary file
|
626
652
|
with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file:
|
627
653
|
temp_file.write(response.content)
|
628
654
|
temp_filename = temp_file.name
|
629
|
-
|
655
|
+
|
630
656
|
# Use the DocxScenario class to process the temporary file
|
631
657
|
from .scenario_list import ScenarioList
|
632
658
|
from .DocxScenario import DocxScenario
|
633
|
-
|
659
|
+
|
634
660
|
# Create a scenario from the DOCX file
|
635
661
|
docx_scenario = DocxScenario(temp_filename)
|
636
|
-
scenarios = [
|
637
|
-
|
662
|
+
scenarios = [
|
663
|
+
Scenario({"text": paragraph}) for paragraph in docx_scenario.paragraphs
|
664
|
+
]
|
665
|
+
|
638
666
|
return ScenarioList(scenarios)
|
639
|
-
|
667
|
+
|
640
668
|
except requests.RequestException as e:
|
641
669
|
raise ScenarioError(f"Failed to fetch Google Doc: {str(e)}")
|
642
670
|
except Exception as e:
|
@@ -645,79 +673,84 @@ class GoogleDocSource(Source):
|
|
645
673
|
|
646
674
|
class PandasSource(Source):
|
647
675
|
source_type = "pandas"
|
648
|
-
|
676
|
+
|
649
677
|
def __init__(self, df):
|
650
678
|
"""
|
651
679
|
Initialize a PandasSource with a pandas DataFrame.
|
652
|
-
|
680
|
+
|
653
681
|
Args:
|
654
682
|
df: A pandas DataFrame.
|
655
683
|
"""
|
656
684
|
try:
|
657
685
|
import pandas as pd
|
686
|
+
|
658
687
|
if not isinstance(df, pd.DataFrame):
|
659
688
|
raise ScenarioError("Input must be a pandas DataFrame")
|
660
689
|
self.df = df
|
661
690
|
except ImportError:
|
662
691
|
raise ImportError("pandas is required for PandasSource")
|
663
|
-
|
692
|
+
|
664
693
|
@classmethod
|
665
|
-
def example(cls) ->
|
694
|
+
def example(cls) -> "PandasSource":
|
666
695
|
"""Return an example PandasSource instance."""
|
667
696
|
try:
|
668
697
|
import pandas as pd
|
669
|
-
|
698
|
+
|
670
699
|
# Create a sample DataFrame for the example
|
671
700
|
sample_data = {
|
672
|
-
|
673
|
-
|
674
|
-
|
701
|
+
"name": ["Alice", "Bob", "Charlie", "David"],
|
702
|
+
"age": [30, 25, 35, 28],
|
703
|
+
"city": ["New York", "San Francisco", "Boston", "Seattle"],
|
675
704
|
}
|
676
705
|
df = pd.DataFrame(sample_data)
|
677
|
-
|
706
|
+
|
678
707
|
return cls(df)
|
679
708
|
except ImportError:
|
680
709
|
# Create a mock instance that doesn't actually need pandas
|
681
710
|
instance = cls.__new__(cls)
|
682
|
-
|
711
|
+
|
683
712
|
# Override the to_scenario_list method just for the example
|
684
713
|
def mock_to_scenario_list(self):
|
685
714
|
from .scenario_list import ScenarioList
|
715
|
+
|
686
716
|
# Create a simple mock ScenarioList
|
687
717
|
scenarios = [
|
688
718
|
Scenario({"name": "Alice", "age": 30, "city": "New York"}),
|
689
719
|
Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
|
690
720
|
Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
|
691
|
-
Scenario({"name": "David", "age": 28, "city": "Seattle"})
|
721
|
+
Scenario({"name": "David", "age": 28, "city": "Seattle"}),
|
692
722
|
]
|
693
723
|
return ScenarioList(scenarios)
|
694
|
-
|
724
|
+
|
695
725
|
# Replace the method on this instance only
|
696
726
|
import types
|
697
|
-
|
698
|
-
|
727
|
+
|
728
|
+
instance.to_scenario_list = types.MethodType(
|
729
|
+
mock_to_scenario_list, instance
|
730
|
+
)
|
731
|
+
|
699
732
|
return instance
|
700
|
-
|
733
|
+
|
701
734
|
def to_scenario_list(self):
|
702
735
|
"""Create a ScenarioList from a pandas DataFrame."""
|
703
736
|
from .scenario_list import ScenarioList
|
704
|
-
|
737
|
+
|
705
738
|
# Convert DataFrame records to scenarios
|
706
739
|
scenarios = []
|
707
740
|
for _, row in self.df.iterrows():
|
708
741
|
scenario_dict = row.to_dict()
|
709
742
|
scenarios.append(Scenario(scenario_dict))
|
710
|
-
|
743
|
+
|
711
744
|
return ScenarioList(scenarios)
|
712
745
|
|
713
746
|
|
714
747
|
class StataSource(Source):
|
715
748
|
source_type = "dta"
|
716
|
-
|
749
|
+
|
717
750
|
def __init__(self, file_path: str, include_metadata: bool = True):
|
718
751
|
"""
|
719
752
|
Initialize a StataSource with a path to a Stata data file.
|
720
|
-
|
753
|
+
|
721
754
|
Args:
|
722
755
|
file_path: Path to the Stata (.dta) file.
|
723
756
|
include_metadata: If True, extract and preserve variable labels and value labels
|
@@ -725,102 +758,108 @@ class StataSource(Source):
|
|
725
758
|
"""
|
726
759
|
self.file_path = file_path
|
727
760
|
self.include_metadata = include_metadata
|
728
|
-
|
761
|
+
|
729
762
|
@classmethod
|
730
|
-
def example(cls) ->
|
763
|
+
def example(cls) -> "StataSource":
|
731
764
|
"""Return an example StataSource instance."""
|
732
765
|
import tempfile
|
733
766
|
import os
|
734
|
-
|
767
|
+
|
735
768
|
# Since we can't easily create a real Stata file for testing,
|
736
769
|
# we'll create a mock instance with an override
|
737
770
|
instance = cls(file_path="/path/to/nonexistent/file.dta")
|
738
|
-
|
771
|
+
|
739
772
|
# Override the to_scenario_list method just for the example
|
740
773
|
def mock_to_scenario_list(self):
|
741
774
|
from .scenario_list import ScenarioList
|
742
|
-
|
775
|
+
|
743
776
|
# Create a simple mock ScenarioList with Stata-like data
|
744
777
|
scenarios = [
|
745
778
|
Scenario({"id": 1, "gender": 1, "income": 50000, "education": 2}),
|
746
779
|
Scenario({"id": 2, "gender": 2, "income": 45000, "education": 3}),
|
747
|
-
Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4})
|
780
|
+
Scenario({"id": 3, "gender": 1, "income": 60000, "education": 4}),
|
748
781
|
]
|
749
|
-
|
782
|
+
|
750
783
|
result = ScenarioList(scenarios)
|
751
|
-
|
784
|
+
|
752
785
|
# Add metadata similar to what would be in a Stata file
|
753
786
|
if self.include_metadata:
|
754
787
|
result.codebook = {
|
755
788
|
"variable_labels": {
|
756
789
|
"gender": "Gender (1=Male, 2=Female)",
|
757
790
|
"income": "Annual income in USD",
|
758
|
-
"education": "Education level (1-4)"
|
791
|
+
"education": "Education level (1-4)",
|
759
792
|
},
|
760
793
|
"value_labels": {
|
761
794
|
"gender": {1: "Male", 2: "Female"},
|
762
|
-
"education": {
|
763
|
-
|
795
|
+
"education": {
|
796
|
+
1: "High School",
|
797
|
+
2: "Associate",
|
798
|
+
3: "Bachelor",
|
799
|
+
4: "Graduate",
|
800
|
+
},
|
801
|
+
},
|
764
802
|
}
|
765
|
-
|
803
|
+
|
766
804
|
return result
|
767
|
-
|
805
|
+
|
768
806
|
# Replace the method on this instance only
|
769
807
|
import types
|
808
|
+
|
770
809
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
771
|
-
|
810
|
+
|
772
811
|
return instance
|
773
|
-
|
812
|
+
|
774
813
|
def to_scenario_list(self):
|
775
814
|
"""Create a ScenarioList from a Stata data file."""
|
776
815
|
from .scenario_list import ScenarioList
|
777
|
-
|
816
|
+
|
778
817
|
try:
|
779
818
|
import pandas as pd
|
780
819
|
except ImportError:
|
781
820
|
raise ImportError("pandas is required to read Stata files")
|
782
|
-
|
821
|
+
|
783
822
|
# Read the Stata file with pandas
|
784
823
|
df = pd.read_stata(self.file_path)
|
785
|
-
|
824
|
+
|
786
825
|
# Create scenarios
|
787
826
|
scenarios = []
|
788
827
|
for _, row in df.iterrows():
|
789
828
|
scenario_dict = row.to_dict()
|
790
829
|
scenarios.append(Scenario(scenario_dict))
|
791
|
-
|
830
|
+
|
792
831
|
# Create the basic ScenarioList
|
793
832
|
result = ScenarioList(scenarios)
|
794
|
-
|
833
|
+
|
795
834
|
# Extract and preserve metadata if requested
|
796
835
|
if self.include_metadata:
|
797
836
|
# Get variable labels (if any)
|
798
837
|
variable_labels = {}
|
799
838
|
if hasattr(df, "variable_labels") and df.variable_labels:
|
800
839
|
variable_labels = df.variable_labels
|
801
|
-
|
840
|
+
|
802
841
|
# Get value labels (if any)
|
803
842
|
value_labels = {}
|
804
843
|
if hasattr(df, "value_labels") and df.value_labels:
|
805
844
|
value_labels = df.value_labels
|
806
|
-
|
845
|
+
|
807
846
|
# Store the metadata in the ScenarioList's codebook
|
808
847
|
if variable_labels or value_labels:
|
809
848
|
result.codebook = {
|
810
849
|
"variable_labels": variable_labels,
|
811
850
|
"value_labels": value_labels,
|
812
851
|
}
|
813
|
-
|
852
|
+
|
814
853
|
return result
|
815
854
|
|
816
855
|
|
817
856
|
class WikipediaSource(Source):
|
818
857
|
source_type = "wikipedia"
|
819
|
-
|
858
|
+
|
820
859
|
def __init__(self, url: str, table_index: int = 0, header: bool = True):
|
821
860
|
"""
|
822
861
|
Initialize a WikipediaSource with a URL to a Wikipedia page.
|
823
|
-
|
862
|
+
|
824
863
|
Args:
|
825
864
|
url: The URL of the Wikipedia page.
|
826
865
|
table_index: The index of the table to extract (default is 0).
|
@@ -829,74 +868,89 @@ class WikipediaSource(Source):
|
|
829
868
|
self.url = url
|
830
869
|
self.table_index = table_index
|
831
870
|
self.header = header
|
832
|
-
|
871
|
+
|
833
872
|
@classmethod
|
834
|
-
def example(cls) ->
|
873
|
+
def example(cls) -> "WikipediaSource":
|
835
874
|
"""Return an example WikipediaSource instance."""
|
836
875
|
# Use a real Wikipedia URL for the example, but we'll override the to_scenario_list method
|
837
876
|
instance = cls(
|
838
877
|
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
|
839
878
|
table_index=0,
|
840
|
-
header=True
|
879
|
+
header=True,
|
841
880
|
)
|
842
|
-
|
881
|
+
|
843
882
|
# Override the to_scenario_list method just for the example
|
844
883
|
def mock_to_scenario_list(self):
|
845
884
|
from .scenario_list import ScenarioList
|
846
|
-
|
885
|
+
|
847
886
|
# Create a simple mock ScenarioList with GDP data
|
848
887
|
scenarios = [
|
849
|
-
Scenario(
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
888
|
+
Scenario(
|
889
|
+
{
|
890
|
+
"Rank": 1,
|
891
|
+
"Country": "United States",
|
892
|
+
"GDP (millions of USD)": 25460000,
|
893
|
+
}
|
894
|
+
),
|
895
|
+
Scenario(
|
896
|
+
{"Rank": 2, "Country": "China", "GDP (millions of USD)": 17963000}
|
897
|
+
),
|
898
|
+
Scenario(
|
899
|
+
{"Rank": 3, "Country": "Japan", "GDP (millions of USD)": 4231000}
|
900
|
+
),
|
901
|
+
Scenario(
|
902
|
+
{"Rank": 4, "Country": "Germany", "GDP (millions of USD)": 4430000}
|
903
|
+
),
|
904
|
+
Scenario(
|
905
|
+
{"Rank": 5, "Country": "India", "GDP (millions of USD)": 3737000}
|
906
|
+
),
|
854
907
|
]
|
855
|
-
|
908
|
+
|
856
909
|
return ScenarioList(scenarios)
|
857
|
-
|
910
|
+
|
858
911
|
# Replace the method on this instance only
|
859
912
|
import types
|
913
|
+
|
860
914
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
861
|
-
|
915
|
+
|
862
916
|
return instance
|
863
|
-
|
917
|
+
|
864
918
|
def to_scenario_list(self):
|
865
919
|
"""Create a ScenarioList from a table on a Wikipedia page."""
|
866
920
|
from .scenario_list import ScenarioList
|
867
921
|
import requests
|
868
|
-
|
922
|
+
|
869
923
|
try:
|
870
924
|
# Try to import pandas
|
871
925
|
import pandas as pd
|
872
926
|
except ImportError:
|
873
927
|
raise ImportError("pandas is required to read Wikipedia tables")
|
874
|
-
|
928
|
+
|
875
929
|
try:
|
876
930
|
# Check if the URL is reachable
|
877
931
|
response = requests.get(self.url)
|
878
932
|
response.raise_for_status() # Raises HTTPError for bad responses
|
879
|
-
|
933
|
+
|
880
934
|
# Extract tables from the Wikipedia page
|
881
935
|
tables = pd.read_html(self.url, header=0 if self.header else None)
|
882
|
-
|
936
|
+
|
883
937
|
# Ensure the requested table index is within the range of available tables
|
884
938
|
if self.table_index >= len(tables) or self.table_index < 0:
|
885
939
|
raise ScenarioError(
|
886
940
|
f"Table index {self.table_index} is out of range. This page has {len(tables)} table(s)."
|
887
941
|
)
|
888
|
-
|
942
|
+
|
889
943
|
# Get the requested table
|
890
944
|
df = tables[self.table_index]
|
891
|
-
|
945
|
+
|
892
946
|
# Convert DataFrame to ScenarioList
|
893
947
|
scenarios = []
|
894
948
|
for _, row in df.iterrows():
|
895
949
|
scenario_dict = row.to_dict()
|
896
950
|
scenarios.append(Scenario(scenario_dict))
|
897
|
-
|
951
|
+
|
898
952
|
return ScenarioList(scenarios)
|
899
|
-
|
953
|
+
|
900
954
|
except requests.exceptions.RequestException as e:
|
901
955
|
raise ScenarioError(f"Error fetching the URL: {str(e)}")
|
902
956
|
except ValueError as e:
|
@@ -907,18 +961,18 @@ class WikipediaSource(Source):
|
|
907
961
|
|
908
962
|
class ExcelSource(Source):
|
909
963
|
source_type = "excel"
|
910
|
-
|
964
|
+
|
911
965
|
def __init__(
|
912
|
-
self,
|
913
|
-
file_path: str,
|
914
|
-
sheet_name: Optional[str] = None,
|
966
|
+
self,
|
967
|
+
file_path: str,
|
968
|
+
sheet_name: Optional[str] = None,
|
915
969
|
skip_rows: Optional[List[int]] = None,
|
916
970
|
use_codebook: bool = False,
|
917
|
-
**kwargs
|
971
|
+
**kwargs,
|
918
972
|
):
|
919
973
|
"""
|
920
974
|
Initialize an ExcelSource with a path to an Excel file.
|
921
|
-
|
975
|
+
|
922
976
|
Args:
|
923
977
|
file_path: Path to the Excel file.
|
924
978
|
sheet_name: Name of the sheet to load. If None and multiple sheets exist,
|
@@ -932,76 +986,81 @@ class ExcelSource(Source):
|
|
932
986
|
self.skip_rows = skip_rows
|
933
987
|
self.use_codebook = use_codebook
|
934
988
|
self.kwargs = kwargs
|
935
|
-
|
989
|
+
|
936
990
|
@classmethod
|
937
|
-
def example(cls) ->
|
991
|
+
def example(cls) -> "ExcelSource":
|
938
992
|
"""Return an example ExcelSource instance."""
|
939
993
|
import tempfile
|
940
994
|
import os
|
941
|
-
|
995
|
+
|
942
996
|
try:
|
943
997
|
import pandas as pd
|
944
|
-
|
998
|
+
|
945
999
|
# Create a temporary Excel file with sample data
|
946
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
1000
|
+
fd, temp_path = tempfile.mkstemp(suffix=".xlsx", prefix="edsl_test_")
|
947
1001
|
os.close(fd) # Close the file descriptor
|
948
|
-
|
1002
|
+
|
949
1003
|
# Create sample data
|
950
|
-
df1 = pd.DataFrame(
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
1004
|
+
df1 = pd.DataFrame(
|
1005
|
+
{
|
1006
|
+
"name": ["Alice", "Bob", "Charlie"],
|
1007
|
+
"age": [30, 25, 35],
|
1008
|
+
"city": ["New York", "San Francisco", "Boston"],
|
1009
|
+
}
|
1010
|
+
)
|
1011
|
+
|
1012
|
+
df2 = pd.DataFrame(
|
1013
|
+
{
|
1014
|
+
"name": ["David", "Eve"],
|
1015
|
+
"age": [40, 45],
|
1016
|
+
"city": ["Seattle", "Chicago"],
|
1017
|
+
}
|
1018
|
+
)
|
1019
|
+
|
962
1020
|
# Write to Excel file with multiple sheets
|
963
1021
|
with pd.ExcelWriter(temp_path) as writer:
|
964
|
-
df1.to_excel(writer, sheet_name=
|
965
|
-
df2.to_excel(writer, sheet_name=
|
966
|
-
|
967
|
-
return cls(
|
968
|
-
|
969
|
-
sheet_name='Sheet1'
|
970
|
-
)
|
971
|
-
|
1022
|
+
df1.to_excel(writer, sheet_name="Sheet1", index=False)
|
1023
|
+
df2.to_excel(writer, sheet_name="Sheet2", index=False)
|
1024
|
+
|
1025
|
+
return cls(file_path=temp_path, sheet_name="Sheet1")
|
1026
|
+
|
972
1027
|
except ImportError:
|
973
1028
|
# Create a mock instance with an override if pandas is not available
|
974
1029
|
instance = cls(file_path="/path/to/nonexistent/file.xlsx")
|
975
|
-
|
1030
|
+
|
976
1031
|
# Override the to_scenario_list method just for the example
|
977
1032
|
def mock_to_scenario_list(self):
|
978
1033
|
from .scenario_list import ScenarioList
|
1034
|
+
|
979
1035
|
# Create a simple mock ScenarioList with sample data
|
980
1036
|
scenarios = [
|
981
1037
|
Scenario({"name": "Alice", "age": 30, "city": "New York"}),
|
982
1038
|
Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
|
983
|
-
Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
|
1039
|
+
Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
|
984
1040
|
]
|
985
1041
|
return ScenarioList(scenarios)
|
986
|
-
|
1042
|
+
|
987
1043
|
# Replace the method on this instance only
|
988
1044
|
import types
|
989
|
-
|
990
|
-
|
1045
|
+
|
1046
|
+
instance.to_scenario_list = types.MethodType(
|
1047
|
+
mock_to_scenario_list, instance
|
1048
|
+
)
|
1049
|
+
|
991
1050
|
return instance
|
992
|
-
|
1051
|
+
|
993
1052
|
def to_scenario_list(self):
|
994
1053
|
"""Create a ScenarioList from an Excel file."""
|
995
1054
|
from .scenario_list import ScenarioList
|
996
|
-
|
1055
|
+
|
997
1056
|
try:
|
998
1057
|
import pandas as pd
|
999
1058
|
except ImportError:
|
1000
1059
|
raise ImportError("pandas is required to read Excel files")
|
1001
|
-
|
1060
|
+
|
1002
1061
|
# Get all sheets
|
1003
1062
|
all_sheets = pd.read_excel(self.file_path, sheet_name=None)
|
1004
|
-
|
1063
|
+
|
1005
1064
|
# If no sheet_name is provided and there is more than one sheet, print available sheets
|
1006
1065
|
sheet_name = self.sheet_name
|
1007
1066
|
if sheet_name is None:
|
@@ -1015,27 +1074,27 @@ class ExcelSource(Source):
|
|
1015
1074
|
else:
|
1016
1075
|
# If there is only one sheet, use it
|
1017
1076
|
sheet_name = list(all_sheets.keys())[0]
|
1018
|
-
|
1077
|
+
|
1019
1078
|
# Load the specified or determined sheet
|
1020
1079
|
df = pd.read_excel(self.file_path, sheet_name=sheet_name, **self.kwargs)
|
1021
|
-
|
1080
|
+
|
1022
1081
|
# Skip specified rows if any
|
1023
1082
|
if self.skip_rows:
|
1024
1083
|
df = df.drop(self.skip_rows)
|
1025
1084
|
# Reset index to ensure continuous indexing
|
1026
1085
|
df = df.reset_index(drop=True)
|
1027
|
-
|
1086
|
+
|
1028
1087
|
# Handle codebook if requested
|
1029
1088
|
if self.use_codebook:
|
1030
1089
|
codebook = {f"col_{i}": col for i, col in enumerate(df.columns)}
|
1031
1090
|
koobedoc = {col: f"col_{i}" for i, col in enumerate(df.columns)}
|
1032
|
-
|
1091
|
+
|
1033
1092
|
# Create scenarios with renamed columns
|
1034
1093
|
scenarios = []
|
1035
1094
|
for _, row in df.iterrows():
|
1036
1095
|
scenario_dict = {koobedoc.get(k): v for k, v in row.to_dict().items()}
|
1037
1096
|
scenarios.append(Scenario(scenario_dict))
|
1038
|
-
|
1097
|
+
|
1039
1098
|
result = ScenarioList(scenarios)
|
1040
1099
|
result.codebook = codebook
|
1041
1100
|
return result
|
@@ -1045,23 +1104,23 @@ class ExcelSource(Source):
|
|
1045
1104
|
for _, row in df.iterrows():
|
1046
1105
|
scenario_dict = row.to_dict()
|
1047
1106
|
scenarios.append(Scenario(scenario_dict))
|
1048
|
-
|
1107
|
+
|
1049
1108
|
return ScenarioList(scenarios)
|
1050
1109
|
|
1051
1110
|
|
1052
1111
|
class GoogleSheetSource(Source):
|
1053
1112
|
source_type = "google_sheet"
|
1054
|
-
|
1113
|
+
|
1055
1114
|
def __init__(
|
1056
|
-
self,
|
1057
|
-
url: str,
|
1058
|
-
sheet_name: Optional[str] = None,
|
1115
|
+
self,
|
1116
|
+
url: str,
|
1117
|
+
sheet_name: Optional[str] = None,
|
1059
1118
|
column_names: Optional[List[str]] = None,
|
1060
|
-
**kwargs
|
1119
|
+
**kwargs,
|
1061
1120
|
):
|
1062
1121
|
"""
|
1063
1122
|
Initialize a GoogleSheetSource with a URL to a Google Sheet.
|
1064
|
-
|
1123
|
+
|
1065
1124
|
Args:
|
1066
1125
|
url: The URL of the Google Sheet.
|
1067
1126
|
sheet_name: The name of the sheet to load. If None, the first sheet will be used.
|
@@ -1073,67 +1132,68 @@ class GoogleSheetSource(Source):
|
|
1073
1132
|
self.sheet_name = sheet_name
|
1074
1133
|
self.column_names = column_names
|
1075
1134
|
self.kwargs = kwargs
|
1076
|
-
|
1135
|
+
|
1077
1136
|
@classmethod
|
1078
|
-
def example(cls) ->
|
1137
|
+
def example(cls) -> "GoogleSheetSource":
|
1079
1138
|
"""Return an example GoogleSheetSource instance."""
|
1080
1139
|
# Use a mock instance since we can't create a real Google Sheet for testing
|
1081
1140
|
instance = cls(
|
1082
1141
|
url="https://docs.google.com/spreadsheets/d/1234567890abcdefg/edit",
|
1083
|
-
sheet_name="Sheet1"
|
1142
|
+
sheet_name="Sheet1",
|
1084
1143
|
)
|
1085
|
-
|
1144
|
+
|
1086
1145
|
# Override the to_scenario_list method just for the example
|
1087
1146
|
def mock_to_scenario_list(self):
|
1088
1147
|
from .scenario_list import ScenarioList
|
1089
|
-
|
1148
|
+
|
1090
1149
|
# Create a simple mock ScenarioList with sample data
|
1091
1150
|
scenarios = [
|
1092
1151
|
Scenario({"name": "Alice", "age": 30, "city": "New York"}),
|
1093
1152
|
Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
|
1094
|
-
Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
|
1153
|
+
Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
|
1095
1154
|
]
|
1096
1155
|
return ScenarioList(scenarios)
|
1097
|
-
|
1156
|
+
|
1098
1157
|
# Replace the method on this instance only
|
1099
1158
|
import types
|
1159
|
+
|
1100
1160
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
1101
|
-
|
1161
|
+
|
1102
1162
|
return instance
|
1103
|
-
|
1163
|
+
|
1104
1164
|
def to_scenario_list(self):
|
1105
1165
|
"""Create a ScenarioList from a Google Sheet."""
|
1106
1166
|
from .scenario_list import ScenarioList
|
1107
1167
|
import tempfile
|
1108
1168
|
import requests
|
1109
|
-
|
1169
|
+
|
1110
1170
|
# Extract the sheet ID from the URL
|
1111
1171
|
if "/edit" in self.url:
|
1112
1172
|
sheet_id = self.url.split("/d/")[1].split("/edit")[0]
|
1113
1173
|
else:
|
1114
1174
|
raise ScenarioError("Invalid Google Sheet URL format.")
|
1115
|
-
|
1175
|
+
|
1116
1176
|
# Create the export URL for XLSX format
|
1117
|
-
export_url =
|
1118
|
-
|
1177
|
+
export_url = (
|
1178
|
+
f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
|
1179
|
+
)
|
1180
|
+
|
1119
1181
|
try:
|
1120
1182
|
# Download the Google Sheet as an Excel file
|
1121
1183
|
response = requests.get(export_url)
|
1122
1184
|
response.raise_for_status() # Ensure the request was successful
|
1123
|
-
|
1185
|
+
|
1124
1186
|
# Save the Excel file to a temporary file
|
1125
1187
|
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file:
|
1126
1188
|
temp_file.write(response.content)
|
1127
1189
|
temp_filename = temp_file.name
|
1128
|
-
|
1190
|
+
|
1129
1191
|
# Use ExcelSource to create the initial ScenarioList
|
1130
1192
|
excel_source = ExcelSource(
|
1131
|
-
file_path=temp_filename,
|
1132
|
-
sheet_name=self.sheet_name,
|
1133
|
-
**self.kwargs
|
1193
|
+
file_path=temp_filename, sheet_name=self.sheet_name, **self.kwargs
|
1134
1194
|
)
|
1135
1195
|
scenario_list = excel_source.to_scenario_list()
|
1136
|
-
|
1196
|
+
|
1137
1197
|
# Apply column renaming if specified
|
1138
1198
|
if self.column_names is not None and scenario_list:
|
1139
1199
|
if len(self.column_names) != len(scenario_list[0].keys()):
|
@@ -1141,21 +1201,23 @@ class GoogleSheetSource(Source):
|
|
1141
1201
|
f"Number of provided column names ({len(self.column_names)}) "
|
1142
1202
|
f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
|
1143
1203
|
)
|
1144
|
-
|
1204
|
+
|
1145
1205
|
# Create a mapping from original keys to new names
|
1146
1206
|
original_keys = list(scenario_list[0].keys())
|
1147
1207
|
column_mapping = dict(zip(original_keys, self.column_names))
|
1148
|
-
|
1208
|
+
|
1149
1209
|
# Create a new ScenarioList with renamed columns
|
1150
1210
|
renamed_scenarios = []
|
1151
1211
|
for scenario in scenario_list:
|
1152
|
-
renamed_scenario = {
|
1212
|
+
renamed_scenario = {
|
1213
|
+
column_mapping.get(k, k): v for k, v in scenario.items()
|
1214
|
+
}
|
1153
1215
|
renamed_scenarios.append(Scenario(renamed_scenario))
|
1154
|
-
|
1216
|
+
|
1155
1217
|
return ScenarioList(renamed_scenarios)
|
1156
|
-
|
1218
|
+
|
1157
1219
|
return scenario_list
|
1158
|
-
|
1220
|
+
|
1159
1221
|
except requests.exceptions.RequestException as e:
|
1160
1222
|
raise ScenarioError(f"Error fetching the Google Sheet: {str(e)}")
|
1161
1223
|
except Exception as e:
|
@@ -1164,18 +1226,18 @@ class GoogleSheetSource(Source):
|
|
1164
1226
|
|
1165
1227
|
class DelimitedFileSource(Source):
|
1166
1228
|
source_type = "delimited_file"
|
1167
|
-
|
1229
|
+
|
1168
1230
|
def __init__(
|
1169
|
-
self,
|
1231
|
+
self,
|
1170
1232
|
file_or_url: str,
|
1171
1233
|
delimiter: str = ",",
|
1172
1234
|
has_header: bool = True,
|
1173
1235
|
encoding: str = "utf-8",
|
1174
|
-
**kwargs
|
1236
|
+
**kwargs,
|
1175
1237
|
):
|
1176
1238
|
"""
|
1177
1239
|
Initialize a DelimitedFileSource with a path to a delimited file or URL.
|
1178
|
-
|
1240
|
+
|
1179
1241
|
Args:
|
1180
1242
|
file_or_url: Path to a local file or URL to a remote file.
|
1181
1243
|
delimiter: The delimiter character used in the file (default is ',').
|
@@ -1188,42 +1250,38 @@ class DelimitedFileSource(Source):
|
|
1188
1250
|
self.has_header = has_header
|
1189
1251
|
self.encoding = encoding
|
1190
1252
|
self.kwargs = kwargs
|
1191
|
-
|
1253
|
+
|
1192
1254
|
@classmethod
|
1193
|
-
def example(cls) ->
|
1255
|
+
def example(cls) -> "DelimitedFileSource":
|
1194
1256
|
"""Return an example DelimitedFileSource instance."""
|
1195
1257
|
import tempfile
|
1196
1258
|
import os
|
1197
|
-
|
1259
|
+
|
1198
1260
|
# Create a temporary CSV file with sample data
|
1199
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
1261
|
+
fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
|
1200
1262
|
os.close(fd) # Close the file descriptor
|
1201
|
-
|
1263
|
+
|
1202
1264
|
# Write sample data to the file
|
1203
|
-
with open(temp_path,
|
1265
|
+
with open(temp_path, "w", newline="") as f:
|
1204
1266
|
f.write("name,age,city\n")
|
1205
1267
|
f.write("Alice,30,New York\n")
|
1206
1268
|
f.write("Bob,25,San Francisco\n")
|
1207
1269
|
f.write("Charlie,35,Boston\n")
|
1208
|
-
|
1209
|
-
return cls(
|
1210
|
-
|
1211
|
-
delimiter=",",
|
1212
|
-
has_header=True
|
1213
|
-
)
|
1214
|
-
|
1270
|
+
|
1271
|
+
return cls(file_or_url=temp_path, delimiter=",", has_header=True)
|
1272
|
+
|
1215
1273
|
def to_scenario_list(self):
|
1216
1274
|
"""Create a ScenarioList from a delimited file or URL."""
|
1217
1275
|
from .scenario_list import ScenarioList
|
1218
1276
|
import requests
|
1219
|
-
|
1277
|
+
|
1220
1278
|
# Check if the input is a URL
|
1221
1279
|
parsed_url = urlparse(self.file_or_url)
|
1222
1280
|
if parsed_url.scheme in ("http", "https"):
|
1223
1281
|
try:
|
1224
1282
|
headers = {
|
1225
1283
|
"Accept": "text/csv,application/csv,text/plain",
|
1226
|
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
1284
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
1227
1285
|
}
|
1228
1286
|
response = requests.get(self.file_or_url, headers=headers)
|
1229
1287
|
response.raise_for_status()
|
@@ -1240,7 +1298,7 @@ class DelimitedFileSource(Source):
|
|
1240
1298
|
encodings_to_try = ["latin-1", "cp1252", "ISO-8859-1"]
|
1241
1299
|
if self.encoding in encodings_to_try:
|
1242
1300
|
encodings_to_try.remove(self.encoding)
|
1243
|
-
|
1301
|
+
|
1244
1302
|
for encoding in encodings_to_try:
|
1245
1303
|
try:
|
1246
1304
|
with open(self.file_or_url, "r", encoding=encoding) as f:
|
@@ -1249,17 +1307,21 @@ class DelimitedFileSource(Source):
|
|
1249
1307
|
except UnicodeDecodeError:
|
1250
1308
|
continue
|
1251
1309
|
else:
|
1252
|
-
raise ScenarioError(
|
1310
|
+
raise ScenarioError(
|
1311
|
+
f"Failed to decode file with any of the attempted encodings"
|
1312
|
+
)
|
1253
1313
|
except Exception as e:
|
1254
1314
|
raise ScenarioError(f"Failed to read file: {str(e)}")
|
1255
|
-
|
1315
|
+
|
1256
1316
|
# Parse the content
|
1257
|
-
csv_reader = csv.reader(
|
1317
|
+
csv_reader = csv.reader(
|
1318
|
+
StringIO(content), delimiter=self.delimiter, **self.kwargs
|
1319
|
+
)
|
1258
1320
|
rows = list(csv_reader)
|
1259
|
-
|
1321
|
+
|
1260
1322
|
if not rows:
|
1261
1323
|
return ScenarioList()
|
1262
|
-
|
1324
|
+
|
1263
1325
|
# Handle header row
|
1264
1326
|
if self.has_header:
|
1265
1327
|
header = rows[0]
|
@@ -1268,33 +1330,50 @@ class DelimitedFileSource(Source):
|
|
1268
1330
|
# Auto-generate column names
|
1269
1331
|
header = [f"col{i}" for i in range(len(rows[0]))]
|
1270
1332
|
data_rows = rows
|
1271
|
-
|
1333
|
+
|
1334
|
+
header_counts = defaultdict(lambda: 0)
|
1335
|
+
new_header = []
|
1336
|
+
for h in header:
|
1337
|
+
print(header_counts)
|
1338
|
+
if header_counts[h] >= 1:
|
1339
|
+
new_header.append(f"{h}_{header_counts[h]}")
|
1340
|
+
warnings.warn(
|
1341
|
+
f"Duplicate header found: {h}. Renamed to {h}_{header_counts[h]}"
|
1342
|
+
)
|
1343
|
+
else:
|
1344
|
+
new_header.append(h)
|
1345
|
+
header_counts[h] += 1
|
1346
|
+
|
1347
|
+
assert len(new_header) == len(set(new_header))
|
1348
|
+
|
1272
1349
|
# Create scenarios
|
1273
1350
|
scenarios = []
|
1274
1351
|
for row in data_rows:
|
1275
|
-
if len(row) != len(
|
1276
|
-
warnings.warn(
|
1352
|
+
if len(row) != len(new_header):
|
1353
|
+
warnings.warn(
|
1354
|
+
f"Skipping row with {len(row)} values (expected {len(header)})"
|
1355
|
+
)
|
1277
1356
|
continue
|
1278
|
-
|
1279
|
-
scenario_dict = dict(zip(
|
1357
|
+
|
1358
|
+
scenario_dict = dict(zip(new_header, row))
|
1280
1359
|
scenarios.append(Scenario(scenario_dict))
|
1281
|
-
|
1360
|
+
|
1282
1361
|
return ScenarioList(scenarios)
|
1283
1362
|
|
1284
1363
|
|
1285
1364
|
class CSVSource(DelimitedFileSource):
|
1286
1365
|
source_type = "csv"
|
1287
|
-
|
1366
|
+
|
1288
1367
|
def __init__(
|
1289
|
-
self,
|
1368
|
+
self,
|
1290
1369
|
file_or_url: str,
|
1291
1370
|
has_header: bool = True,
|
1292
1371
|
encoding: str = "utf-8",
|
1293
|
-
**kwargs
|
1372
|
+
**kwargs,
|
1294
1373
|
):
|
1295
1374
|
"""
|
1296
1375
|
Initialize a CSVSource with a path to a CSV file or URL.
|
1297
|
-
|
1376
|
+
|
1298
1377
|
Args:
|
1299
1378
|
file_or_url: Path to a local file or URL to a remote file.
|
1300
1379
|
has_header: Whether the file has a header row (default is True).
|
@@ -1306,45 +1385,42 @@ class CSVSource(DelimitedFileSource):
|
|
1306
1385
|
delimiter=",",
|
1307
1386
|
has_header=has_header,
|
1308
1387
|
encoding=encoding,
|
1309
|
-
**kwargs
|
1388
|
+
**kwargs,
|
1310
1389
|
)
|
1311
|
-
|
1390
|
+
|
1312
1391
|
@classmethod
|
1313
|
-
def example(cls) ->
|
1392
|
+
def example(cls) -> "CSVSource":
|
1314
1393
|
"""Return an example CSVSource instance."""
|
1315
1394
|
import tempfile
|
1316
1395
|
import os
|
1317
|
-
|
1396
|
+
|
1318
1397
|
# Create a temporary CSV file with sample data
|
1319
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
1398
|
+
fd, temp_path = tempfile.mkstemp(suffix=".csv", prefix="edsl_test_")
|
1320
1399
|
os.close(fd) # Close the file descriptor
|
1321
|
-
|
1400
|
+
|
1322
1401
|
# Write sample data to the file
|
1323
|
-
with open(temp_path,
|
1402
|
+
with open(temp_path, "w", newline="") as f:
|
1324
1403
|
f.write("name,age,city\n")
|
1325
1404
|
f.write("Alice,30,New York\n")
|
1326
1405
|
f.write("Bob,25,San Francisco\n")
|
1327
1406
|
f.write("Charlie,35,Boston\n")
|
1328
|
-
|
1329
|
-
return cls(
|
1330
|
-
file_or_url=temp_path,
|
1331
|
-
has_header=True
|
1332
|
-
)
|
1407
|
+
|
1408
|
+
return cls(file_or_url=temp_path, has_header=True)
|
1333
1409
|
|
1334
1410
|
|
1335
1411
|
class TSVSource(DelimitedFileSource):
|
1336
1412
|
source_type = "tsv"
|
1337
|
-
|
1413
|
+
|
1338
1414
|
def __init__(
|
1339
|
-
self,
|
1415
|
+
self,
|
1340
1416
|
file_or_url: str,
|
1341
1417
|
has_header: bool = True,
|
1342
1418
|
encoding: str = "utf-8",
|
1343
|
-
**kwargs
|
1419
|
+
**kwargs,
|
1344
1420
|
):
|
1345
1421
|
"""
|
1346
1422
|
Initialize a TSVSource with a path to a TSV file or URL.
|
1347
|
-
|
1423
|
+
|
1348
1424
|
Args:
|
1349
1425
|
file_or_url: Path to a local file or URL to a remote file.
|
1350
1426
|
has_header: Whether the file has a header row (default is True).
|
@@ -1356,130 +1432,134 @@ class TSVSource(DelimitedFileSource):
|
|
1356
1432
|
delimiter="\t",
|
1357
1433
|
has_header=has_header,
|
1358
1434
|
encoding=encoding,
|
1359
|
-
**kwargs
|
1435
|
+
**kwargs,
|
1360
1436
|
)
|
1361
|
-
|
1437
|
+
|
1362
1438
|
@classmethod
|
1363
|
-
def example(cls) ->
|
1439
|
+
def example(cls) -> "TSVSource":
|
1364
1440
|
"""Return an example TSVSource instance."""
|
1365
1441
|
import tempfile
|
1366
1442
|
import os
|
1367
|
-
|
1443
|
+
|
1368
1444
|
# Create a temporary TSV file with sample data
|
1369
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
1445
|
+
fd, temp_path = tempfile.mkstemp(suffix=".tsv", prefix="edsl_test_")
|
1370
1446
|
os.close(fd) # Close the file descriptor
|
1371
|
-
|
1447
|
+
|
1372
1448
|
# Write sample data to the file
|
1373
|
-
with open(temp_path,
|
1449
|
+
with open(temp_path, "w", newline="") as f:
|
1374
1450
|
f.write("name\tage\tcity\n")
|
1375
1451
|
f.write("Alice\t30\tNew York\n")
|
1376
1452
|
f.write("Bob\t25\tSan Francisco\n")
|
1377
1453
|
f.write("Charlie\t35\tBoston\n")
|
1378
|
-
|
1379
|
-
return cls(
|
1380
|
-
|
1381
|
-
has_header=True
|
1382
|
-
)
|
1454
|
+
|
1455
|
+
return cls(file_or_url=temp_path, has_header=True)
|
1456
|
+
|
1383
1457
|
|
1384
1458
|
class ParquetSource(Source):
|
1385
1459
|
source_type = "parquet"
|
1386
|
-
|
1460
|
+
|
1387
1461
|
def __init__(self, file_path: str):
|
1388
1462
|
"""
|
1389
1463
|
Initialize a ParquetSource with a path to a Parquet file.
|
1390
|
-
|
1464
|
+
|
1391
1465
|
Args:
|
1392
1466
|
file_path: Path to the Parquet file.
|
1393
1467
|
"""
|
1394
1468
|
self.file_path = file_path
|
1395
|
-
|
1469
|
+
|
1396
1470
|
@classmethod
|
1397
|
-
def example(cls) ->
|
1471
|
+
def example(cls) -> "ParquetSource":
|
1398
1472
|
"""Return an example ParquetSource instance."""
|
1399
1473
|
import tempfile
|
1400
1474
|
import os
|
1401
|
-
|
1475
|
+
|
1402
1476
|
try:
|
1403
1477
|
import pandas as pd
|
1404
1478
|
import pyarrow as pa
|
1405
1479
|
import pyarrow.parquet as pq
|
1406
|
-
|
1480
|
+
|
1407
1481
|
# Create a temporary Parquet file with sample data
|
1408
|
-
fd, temp_path = tempfile.mkstemp(suffix=
|
1482
|
+
fd, temp_path = tempfile.mkstemp(suffix=".parquet", prefix="edsl_test_")
|
1409
1483
|
os.close(fd) # Close the file descriptor
|
1410
|
-
|
1484
|
+
|
1411
1485
|
# Create sample data
|
1412
|
-
df = pd.DataFrame(
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1486
|
+
df = pd.DataFrame(
|
1487
|
+
{
|
1488
|
+
"name": ["Alice", "Bob", "Charlie"],
|
1489
|
+
"age": [30, 25, 35],
|
1490
|
+
"city": ["New York", "San Francisco", "Boston"],
|
1491
|
+
}
|
1492
|
+
)
|
1493
|
+
|
1418
1494
|
# Write to Parquet file
|
1419
1495
|
df.to_parquet(temp_path)
|
1420
|
-
|
1496
|
+
|
1421
1497
|
return cls(file_path=temp_path)
|
1422
|
-
|
1498
|
+
|
1423
1499
|
except ImportError:
|
1424
1500
|
# Create a mock instance with an override if pandas or pyarrow is not available
|
1425
1501
|
instance = cls(file_path="/path/to/nonexistent/file.parquet")
|
1426
|
-
|
1502
|
+
|
1427
1503
|
# Override the to_scenario_list method just for the example
|
1428
1504
|
def mock_to_scenario_list(self):
|
1429
1505
|
from .scenario_list import ScenarioList
|
1506
|
+
|
1430
1507
|
# Create a simple mock ScenarioList with sample data
|
1431
1508
|
scenarios = [
|
1432
1509
|
Scenario({"name": "Alice", "age": 30, "city": "New York"}),
|
1433
1510
|
Scenario({"name": "Bob", "age": 25, "city": "San Francisco"}),
|
1434
|
-
Scenario({"name": "Charlie", "age": 35, "city": "Boston"})
|
1511
|
+
Scenario({"name": "Charlie", "age": 35, "city": "Boston"}),
|
1435
1512
|
]
|
1436
1513
|
return ScenarioList(scenarios)
|
1437
|
-
|
1514
|
+
|
1438
1515
|
# Replace the method on this instance only
|
1439
1516
|
import types
|
1440
|
-
|
1441
|
-
|
1517
|
+
|
1518
|
+
instance.to_scenario_list = types.MethodType(
|
1519
|
+
mock_to_scenario_list, instance
|
1520
|
+
)
|
1521
|
+
|
1442
1522
|
return instance
|
1443
|
-
|
1523
|
+
|
1444
1524
|
def to_scenario_list(self):
|
1445
1525
|
"""Create a ScenarioList from a Parquet file."""
|
1446
1526
|
from .scenario_list import ScenarioList
|
1447
|
-
|
1527
|
+
|
1448
1528
|
try:
|
1449
1529
|
import pandas as pd
|
1450
1530
|
except ImportError:
|
1451
1531
|
raise ImportError("pandas is required to read Parquet files")
|
1452
|
-
|
1532
|
+
|
1453
1533
|
try:
|
1454
1534
|
import pyarrow
|
1455
1535
|
except ImportError:
|
1456
1536
|
raise ImportError("pyarrow is required to read Parquet files")
|
1457
|
-
|
1537
|
+
|
1458
1538
|
# Read the Parquet file
|
1459
1539
|
df = pd.read_parquet(self.file_path)
|
1460
|
-
|
1540
|
+
|
1461
1541
|
# Convert DataFrame to ScenarioList
|
1462
1542
|
scenarios = []
|
1463
1543
|
for _, row in df.iterrows():
|
1464
1544
|
scenario_dict = row.to_dict()
|
1465
1545
|
scenarios.append(Scenario(scenario_dict))
|
1466
|
-
|
1546
|
+
|
1467
1547
|
return ScenarioList(scenarios)
|
1468
1548
|
|
1469
1549
|
|
1470
1550
|
class PDFSource(Source):
|
1471
1551
|
source_type = "pdf"
|
1472
|
-
|
1552
|
+
|
1473
1553
|
def __init__(
|
1474
|
-
self,
|
1554
|
+
self,
|
1475
1555
|
file_path: str,
|
1476
1556
|
chunk_type: Literal["page", "text"] = "page",
|
1477
1557
|
chunk_size: int = 1,
|
1478
|
-
chunk_overlap: int = 0
|
1558
|
+
chunk_overlap: int = 0,
|
1479
1559
|
):
|
1480
1560
|
"""
|
1481
1561
|
Initialize a PDFSource with a path to a PDF file.
|
1482
|
-
|
1562
|
+
|
1483
1563
|
Args:
|
1484
1564
|
file_path: Path to the PDF file or URL to a PDF.
|
1485
1565
|
chunk_type: Type of chunking to use ("page" or "text").
|
@@ -1490,39 +1570,53 @@ class PDFSource(Source):
|
|
1490
1570
|
self.chunk_type = chunk_type
|
1491
1571
|
self.chunk_size = chunk_size
|
1492
1572
|
self.chunk_overlap = chunk_overlap
|
1493
|
-
|
1573
|
+
|
1494
1574
|
@classmethod
|
1495
|
-
def example(cls) ->
|
1575
|
+
def example(cls) -> "PDFSource":
|
1496
1576
|
"""Return an example PDFSource instance."""
|
1497
1577
|
# Skip actual file creation and just use a mock instance
|
1498
1578
|
instance = cls(
|
1499
1579
|
file_path="/path/to/nonexistent/file.pdf",
|
1500
1580
|
chunk_type="page",
|
1501
1581
|
chunk_size=1,
|
1502
|
-
chunk_overlap=0
|
1582
|
+
chunk_overlap=0,
|
1503
1583
|
)
|
1504
|
-
|
1584
|
+
|
1505
1585
|
# Override the to_scenario_list method just for the example
|
1506
1586
|
def mock_to_scenario_list(self):
|
1507
1587
|
from .scenario_list import ScenarioList
|
1588
|
+
|
1508
1589
|
# Create a simple mock ScenarioList with sample PDF data
|
1509
1590
|
scenarios = [
|
1510
|
-
Scenario(
|
1511
|
-
|
1591
|
+
Scenario(
|
1592
|
+
{
|
1593
|
+
"filename": "example.pdf",
|
1594
|
+
"page": 1,
|
1595
|
+
"text": "This is page 1 content",
|
1596
|
+
}
|
1597
|
+
),
|
1598
|
+
Scenario(
|
1599
|
+
{
|
1600
|
+
"filename": "example.pdf",
|
1601
|
+
"page": 2,
|
1602
|
+
"text": "This is page 2 content",
|
1603
|
+
}
|
1604
|
+
),
|
1512
1605
|
]
|
1513
1606
|
return ScenarioList(scenarios)
|
1514
|
-
|
1607
|
+
|
1515
1608
|
# Replace the method on this instance only
|
1516
1609
|
import types
|
1610
|
+
|
1517
1611
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
1518
|
-
|
1612
|
+
|
1519
1613
|
return instance
|
1520
|
-
|
1614
|
+
|
1521
1615
|
def to_scenario_list(self):
|
1522
1616
|
"""Create a ScenarioList from a PDF file."""
|
1523
1617
|
from .scenario_list import ScenarioList
|
1524
1618
|
from .scenario_list_pdf_tools import PdfTools
|
1525
|
-
|
1619
|
+
|
1526
1620
|
try:
|
1527
1621
|
# Check if it's a URL
|
1528
1622
|
if PdfTools.is_url(self.file_path):
|
@@ -1534,14 +1628,16 @@ class PDFSource(Source):
|
|
1534
1628
|
)
|
1535
1629
|
else:
|
1536
1630
|
# It's a regular URL
|
1537
|
-
local_path = PdfTools.fetch_and_save_pdf(
|
1631
|
+
local_path = PdfTools.fetch_and_save_pdf(
|
1632
|
+
self.file_path, "temp_pdf.pdf"
|
1633
|
+
)
|
1538
1634
|
else:
|
1539
1635
|
# It's a local file path
|
1540
1636
|
local_path = self.file_path
|
1541
|
-
|
1637
|
+
|
1542
1638
|
# Extract scenarios from the PDF
|
1543
1639
|
scenarios = list(PdfTools.extract_text_from_pdf(local_path))
|
1544
|
-
|
1640
|
+
|
1545
1641
|
# Handle chunking based on the specified parameters
|
1546
1642
|
if self.chunk_type == "page":
|
1547
1643
|
# Default behavior - one scenario per page
|
@@ -1551,31 +1647,31 @@ class PDFSource(Source):
|
|
1551
1647
|
combined_text = ""
|
1552
1648
|
for scenario in scenarios:
|
1553
1649
|
combined_text += scenario["text"]
|
1554
|
-
|
1650
|
+
|
1555
1651
|
# Create a single scenario with all text
|
1556
1652
|
base_scenario = scenarios[0].copy()
|
1557
1653
|
base_scenario["text"] = combined_text
|
1558
1654
|
return ScenarioList([base_scenario])
|
1559
1655
|
else:
|
1560
|
-
raise ValueError(
|
1561
|
-
|
1656
|
+
raise ValueError(
|
1657
|
+
f"Invalid chunk_type: {self.chunk_type}. Must be 'page' or 'text'."
|
1658
|
+
)
|
1659
|
+
|
1562
1660
|
except Exception as e:
|
1563
1661
|
from .exceptions import ScenarioError
|
1662
|
+
|
1564
1663
|
raise ScenarioError(f"Error processing PDF: {str(e)}")
|
1565
1664
|
|
1566
1665
|
|
1567
1666
|
class PDFImageSource(Source):
|
1568
1667
|
source_type = "pdf_to_image"
|
1569
|
-
|
1668
|
+
|
1570
1669
|
def __init__(
|
1571
|
-
self,
|
1572
|
-
file_path: str,
|
1573
|
-
base_width: int = 2000,
|
1574
|
-
include_text: bool = True
|
1670
|
+
self, file_path: str, base_width: int = 2000, include_text: bool = True
|
1575
1671
|
):
|
1576
1672
|
"""
|
1577
1673
|
Initialize a PDFImageSource with a path to a PDF file.
|
1578
|
-
|
1674
|
+
|
1579
1675
|
Args:
|
1580
1676
|
file_path: Path to the PDF file.
|
1581
1677
|
base_width: Width to use for the generated images.
|
@@ -1584,74 +1680,91 @@ class PDFImageSource(Source):
|
|
1584
1680
|
self.file_path = file_path
|
1585
1681
|
self.base_width = base_width
|
1586
1682
|
self.include_text = include_text
|
1587
|
-
|
1683
|
+
|
1588
1684
|
@classmethod
|
1589
|
-
def example(cls) ->
|
1685
|
+
def example(cls) -> "PDFImageSource":
|
1590
1686
|
"""Return an example PDFImageSource instance."""
|
1591
1687
|
# Skip actual file creation and just use a mock instance
|
1592
1688
|
instance = cls(
|
1593
1689
|
file_path="/path/to/nonexistent/file.pdf",
|
1594
1690
|
base_width=2000,
|
1595
|
-
include_text=True
|
1691
|
+
include_text=True,
|
1596
1692
|
)
|
1597
|
-
|
1693
|
+
|
1598
1694
|
# Override the to_scenario_list method just for the example
|
1599
1695
|
def mock_to_scenario_list(self):
|
1600
1696
|
from .scenario_list import ScenarioList
|
1697
|
+
|
1601
1698
|
# Create a simple mock ScenarioList with sample PDF image data
|
1602
1699
|
scenarios = [
|
1603
|
-
Scenario(
|
1604
|
-
|
1700
|
+
Scenario(
|
1701
|
+
{
|
1702
|
+
"filepath": "/tmp/page_1.jpeg",
|
1703
|
+
"page": 0,
|
1704
|
+
"text": "This is page 1 content",
|
1705
|
+
}
|
1706
|
+
),
|
1707
|
+
Scenario(
|
1708
|
+
{
|
1709
|
+
"filepath": "/tmp/page_2.jpeg",
|
1710
|
+
"page": 1,
|
1711
|
+
"text": "This is page 2 content",
|
1712
|
+
}
|
1713
|
+
),
|
1605
1714
|
]
|
1606
1715
|
return ScenarioList(scenarios)
|
1607
|
-
|
1716
|
+
|
1608
1717
|
# Replace the method on this instance only
|
1609
1718
|
import types
|
1719
|
+
|
1610
1720
|
instance.to_scenario_list = types.MethodType(mock_to_scenario_list, instance)
|
1611
|
-
|
1721
|
+
|
1612
1722
|
return instance
|
1613
|
-
|
1723
|
+
|
1614
1724
|
def to_scenario_list(self):
|
1615
1725
|
"""Create a ScenarioList from a PDF file, converting pages to images."""
|
1616
1726
|
from .scenario_list import ScenarioList
|
1617
1727
|
from .scenario_list_pdf_tools import PdfTools
|
1618
|
-
|
1728
|
+
|
1619
1729
|
try:
|
1620
1730
|
# Import pdf2image library
|
1621
1731
|
try:
|
1622
1732
|
from pdf2image import convert_from_path
|
1623
1733
|
except ImportError:
|
1624
|
-
raise ImportError(
|
1625
|
-
|
1734
|
+
raise ImportError(
|
1735
|
+
"pdf2image is required to convert PDF to images. Install it with 'pip install pdf2image'."
|
1736
|
+
)
|
1737
|
+
|
1626
1738
|
# Convert PDF pages to images
|
1627
1739
|
scenarios = PdfTools.from_pdf_to_image(self.file_path, image_format="jpeg")
|
1628
1740
|
return ScenarioList(scenarios)
|
1629
|
-
|
1741
|
+
|
1630
1742
|
except Exception as e:
|
1631
1743
|
from .exceptions import ScenarioError
|
1744
|
+
|
1632
1745
|
raise ScenarioError(f"Error converting PDF to images: {str(e)}")
|
1633
1746
|
|
1634
1747
|
|
1635
1748
|
class ScenarioSource:
|
1636
1749
|
"""
|
1637
1750
|
Factory class for creating ScenarioList objects from various sources.
|
1638
|
-
|
1751
|
+
|
1639
1752
|
This class provides static methods for creating ScenarioList objects from different
|
1640
1753
|
data sources, centralizing the creation logic that was previously scattered across
|
1641
1754
|
different classmethods in the ScenarioList class.
|
1642
|
-
|
1755
|
+
|
1643
1756
|
The main entry point is the from_source method, which dispatches to appropriate
|
1644
1757
|
source-specific methods based on the source_type parameter.
|
1645
1758
|
"""
|
1646
|
-
|
1759
|
+
|
1647
1760
|
@staticmethod
|
1648
1761
|
def from_source(source_type: str, *args, **kwargs):
|
1649
1762
|
"""
|
1650
1763
|
Create a ScenarioList from a specified source type.
|
1651
|
-
|
1764
|
+
|
1652
1765
|
This method serves as the main entry point for creating ScenarioList objects,
|
1653
1766
|
dispatching to the appropriate source-specific method based on the source_type.
|
1654
|
-
|
1767
|
+
|
1655
1768
|
Args:
|
1656
1769
|
source_type: The type of source to create a ScenarioList from.
|
1657
1770
|
Valid values include: 'urls', 'directory', 'list', 'list_of_tuples',
|
@@ -1660,10 +1773,10 @@ class ScenarioSource:
|
|
1660
1773
|
'nested_dict', 'parquet', 'pdf', 'pdf_to_image'.
|
1661
1774
|
*args: Positional arguments to pass to the source-specific method.
|
1662
1775
|
**kwargs: Keyword arguments to pass to the source-specific method.
|
1663
|
-
|
1776
|
+
|
1664
1777
|
Returns:
|
1665
1778
|
A ScenarioList object created from the specified source.
|
1666
|
-
|
1779
|
+
|
1667
1780
|
Raises:
|
1668
1781
|
ValueError: If the source_type is not recognized.
|
1669
1782
|
"""
|
@@ -1679,14 +1792,14 @@ class ScenarioSource:
|
|
1679
1792
|
return method(*args, **kwargs)
|
1680
1793
|
else:
|
1681
1794
|
raise ValueError(f"Unsupported source type: {source_type}")
|
1682
|
-
|
1795
|
+
|
1683
1796
|
@staticmethod
|
1684
1797
|
def _from_urls(urls: list[str], field_name: Optional[str] = "text"):
|
1685
1798
|
"""Create a ScenarioList from a list of URLs."""
|
1686
1799
|
from .scenario_list import ScenarioList
|
1687
|
-
|
1800
|
+
|
1688
1801
|
import requests
|
1689
|
-
|
1802
|
+
|
1690
1803
|
result = ScenarioList()
|
1691
1804
|
for url in urls:
|
1692
1805
|
try:
|
@@ -1697,9 +1810,9 @@ class ScenarioSource:
|
|
1697
1810
|
except requests.RequestException as e:
|
1698
1811
|
warnings.warn(f"Failed to fetch URL {url}: {str(e)}")
|
1699
1812
|
continue
|
1700
|
-
|
1813
|
+
|
1701
1814
|
return result
|
1702
|
-
|
1815
|
+
|
1703
1816
|
@staticmethod
|
1704
1817
|
def _from_directory(
|
1705
1818
|
directory: str,
|
@@ -1713,7 +1826,7 @@ class ScenarioSource:
|
|
1713
1826
|
warnings.warn(
|
1714
1827
|
"_from_directory is deprecated. Use DirectorySource directly or ScenarioSource.from_source('directory', ...) instead.",
|
1715
1828
|
DeprecationWarning,
|
1716
|
-
stacklevel=2
|
1829
|
+
stacklevel=2,
|
1717
1830
|
)
|
1718
1831
|
source = DirectorySource(
|
1719
1832
|
directory=directory,
|
@@ -1721,23 +1834,21 @@ class ScenarioSource:
|
|
1721
1834
|
recursive=recursive,
|
1722
1835
|
metadata=metadata,
|
1723
1836
|
ignore_dirs=ignore_dirs,
|
1724
|
-
ignore_files=ignore_files
|
1837
|
+
ignore_files=ignore_files,
|
1725
1838
|
)
|
1726
1839
|
return source.to_scenario_list()
|
1727
|
-
|
1840
|
+
|
1728
1841
|
@staticmethod
|
1729
|
-
def _from_list(
|
1730
|
-
field_name: str, values: list, use_indexes: bool = False
|
1731
|
-
):
|
1842
|
+
def _from_list(field_name: str, values: list, use_indexes: bool = False):
|
1732
1843
|
"""Create a ScenarioList from a list of values with a specified field name."""
|
1733
1844
|
warnings.warn(
|
1734
1845
|
"_from_list is deprecated. Use ListSource directly or ScenarioSource.from_source('list', ...) instead.",
|
1735
1846
|
DeprecationWarning,
|
1736
|
-
stacklevel=2
|
1847
|
+
stacklevel=2,
|
1737
1848
|
)
|
1738
1849
|
source = ListSource(field_name, values, use_indexes)
|
1739
1850
|
return source.to_scenario_list()
|
1740
|
-
|
1851
|
+
|
1741
1852
|
@staticmethod
|
1742
1853
|
def _from_list_of_tuples(
|
1743
1854
|
field_names: list[str], values: list[tuple], use_indexes: bool = False
|
@@ -1746,107 +1857,106 @@ class ScenarioSource:
|
|
1746
1857
|
warnings.warn(
|
1747
1858
|
"_from_list_of_tuples is deprecated. Use TuplesSource directly or ScenarioSource.from_source('list_of_tuples', ...) instead.",
|
1748
1859
|
DeprecationWarning,
|
1749
|
-
stacklevel=2
|
1860
|
+
stacklevel=2,
|
1750
1861
|
)
|
1751
1862
|
source = TuplesSource(field_names, values, use_indexes)
|
1752
1863
|
return source.to_scenario_list()
|
1753
|
-
|
1864
|
+
|
1754
1865
|
@staticmethod
|
1755
|
-
def _from_sqlite(
|
1756
|
-
db_path: str, table: str, fields: Optional[list] = None
|
1757
|
-
):
|
1866
|
+
def _from_sqlite(db_path: str, table: str, fields: Optional[list] = None):
|
1758
1867
|
"""Create a ScenarioList from a SQLite database."""
|
1759
1868
|
warnings.warn(
|
1760
1869
|
"_from_sqlite is deprecated. Use SQLiteSource directly or ScenarioSource.from_source('sqlite', ...) instead.",
|
1761
1870
|
DeprecationWarning,
|
1762
|
-
stacklevel=2
|
1871
|
+
stacklevel=2,
|
1763
1872
|
)
|
1764
1873
|
source = SQLiteSource(db_path, table, fields)
|
1765
1874
|
return source.to_scenario_list()
|
1766
|
-
|
1875
|
+
|
1767
1876
|
@staticmethod
|
1768
|
-
def _from_latex(
|
1769
|
-
file_path: str, table_index: int = 0, has_header: bool = True
|
1770
|
-
):
|
1877
|
+
def _from_latex(file_path: str, table_index: int = 0, has_header: bool = True):
|
1771
1878
|
"""Create a ScenarioList from a LaTeX file."""
|
1772
1879
|
warnings.warn(
|
1773
1880
|
"_from_latex is deprecated. Use LaTeXSource directly or ScenarioSource.from_source('latex', ...) instead.",
|
1774
1881
|
DeprecationWarning,
|
1775
|
-
stacklevel=2
|
1882
|
+
stacklevel=2,
|
1776
1883
|
)
|
1777
1884
|
source = LaTeXSource(file_path, table_index, has_header)
|
1778
1885
|
return source.to_scenario_list()
|
1779
|
-
|
1886
|
+
|
1780
1887
|
@staticmethod
|
1781
1888
|
def _from_google_doc(url: str):
|
1782
1889
|
"""Create a ScenarioList from a Google Doc."""
|
1783
1890
|
warnings.warn(
|
1784
1891
|
"_from_google_doc is deprecated. Use GoogleDocSource directly or ScenarioSource.from_source('google_doc', ...) instead.",
|
1785
1892
|
DeprecationWarning,
|
1786
|
-
stacklevel=2
|
1893
|
+
stacklevel=2,
|
1787
1894
|
)
|
1788
1895
|
source = GoogleDocSource(url)
|
1789
1896
|
return source.to_scenario_list()
|
1790
|
-
|
1897
|
+
|
1791
1898
|
@staticmethod
|
1792
1899
|
def _from_pandas(df):
|
1793
1900
|
"""Create a ScenarioList from a pandas DataFrame."""
|
1794
1901
|
warnings.warn(
|
1795
1902
|
"_from_pandas is deprecated. Use PandasSource directly or ScenarioSource.from_source('pandas', ...) instead.",
|
1796
1903
|
DeprecationWarning,
|
1797
|
-
stacklevel=2
|
1904
|
+
stacklevel=2,
|
1798
1905
|
)
|
1799
1906
|
source = PandasSource(df)
|
1800
1907
|
return source.to_scenario_list()
|
1801
|
-
|
1908
|
+
|
1802
1909
|
@staticmethod
|
1803
1910
|
def _from_dta(file_path: str, include_metadata: bool = True):
|
1804
1911
|
"""Create a ScenarioList from a Stata data file."""
|
1805
1912
|
warnings.warn(
|
1806
1913
|
"_from_dta is deprecated. Use StataSource directly or ScenarioSource.from_source('dta', ...) instead.",
|
1807
1914
|
DeprecationWarning,
|
1808
|
-
stacklevel=2
|
1915
|
+
stacklevel=2,
|
1809
1916
|
)
|
1810
1917
|
source = StataSource(file_path, include_metadata)
|
1811
1918
|
return source.to_scenario_list()
|
1812
|
-
|
1919
|
+
|
1813
1920
|
@staticmethod
|
1814
|
-
def _from_wikipedia(
|
1815
|
-
url: str, table_index: int = 0, header: bool = True
|
1816
|
-
):
|
1921
|
+
def _from_wikipedia(url: str, table_index: int = 0, header: bool = True):
|
1817
1922
|
"""Create a ScenarioList from a table on a Wikipedia page."""
|
1818
1923
|
warnings.warn(
|
1819
1924
|
"_from_wikipedia is deprecated. Use WikipediaSource directly or ScenarioSource.from_source('wikipedia', ...) instead.",
|
1820
1925
|
DeprecationWarning,
|
1821
|
-
stacklevel=2
|
1926
|
+
stacklevel=2,
|
1822
1927
|
)
|
1823
1928
|
source = WikipediaSource(url, table_index, header)
|
1824
1929
|
return source.to_scenario_list()
|
1825
|
-
|
1930
|
+
|
1826
1931
|
@staticmethod
|
1827
|
-
def _from_excel(
|
1828
|
-
file_path: str, sheet_name: Optional[str] = None, **kwargs
|
1829
|
-
):
|
1932
|
+
def _from_excel(file_path: str, sheet_name: Optional[str] = None, **kwargs):
|
1830
1933
|
"""Create a ScenarioList from an Excel file."""
|
1831
1934
|
warnings.warn(
|
1832
1935
|
"_from_excel is deprecated. Use ExcelSource directly or ScenarioSource.from_source('excel', ...) instead.",
|
1833
1936
|
DeprecationWarning,
|
1834
|
-
stacklevel=2
|
1937
|
+
stacklevel=2,
|
1835
1938
|
)
|
1836
1939
|
source = ExcelSource(file_path, sheet_name=sheet_name, **kwargs)
|
1837
1940
|
return source.to_scenario_list()
|
1838
|
-
|
1941
|
+
|
1839
1942
|
@staticmethod
|
1840
|
-
def _from_google_sheet(
|
1943
|
+
def _from_google_sheet(
|
1944
|
+
url: str,
|
1945
|
+
sheet_name: Optional[str] = None,
|
1946
|
+
column_names: Optional[List[str]] = None,
|
1947
|
+
**kwargs,
|
1948
|
+
):
|
1841
1949
|
"""Create a ScenarioList from a Google Sheet."""
|
1842
1950
|
warnings.warn(
|
1843
1951
|
"_from_google_sheet is deprecated. Use GoogleSheetSource directly or ScenarioSource.from_source('google_sheet', ...) instead.",
|
1844
1952
|
DeprecationWarning,
|
1845
|
-
stacklevel=2
|
1953
|
+
stacklevel=2,
|
1954
|
+
)
|
1955
|
+
source = GoogleSheetSource(
|
1956
|
+
url, sheet_name=sheet_name, column_names=column_names, **kwargs
|
1846
1957
|
)
|
1847
|
-
source = GoogleSheetSource(url, sheet_name=sheet_name, column_names=column_names, **kwargs)
|
1848
1958
|
return source.to_scenario_list()
|
1849
|
-
|
1959
|
+
|
1850
1960
|
@staticmethod
|
1851
1961
|
def _from_delimited_file(
|
1852
1962
|
file_or_url: str,
|
@@ -1859,44 +1969,44 @@ class ScenarioSource:
|
|
1859
1969
|
warnings.warn(
|
1860
1970
|
"_from_delimited_file is deprecated. Use DelimitedFileSource directly or ScenarioSource.from_source('delimited_file', ...) instead.",
|
1861
1971
|
DeprecationWarning,
|
1862
|
-
stacklevel=2
|
1972
|
+
stacklevel=2,
|
1863
1973
|
)
|
1864
1974
|
source = DelimitedFileSource(
|
1865
1975
|
file_or_url=file_or_url,
|
1866
1976
|
delimiter=delimiter,
|
1867
1977
|
has_header=has_header,
|
1868
1978
|
encoding=encoding,
|
1869
|
-
**kwargs
|
1979
|
+
**kwargs,
|
1870
1980
|
)
|
1871
1981
|
return source.to_scenario_list()
|
1872
|
-
|
1982
|
+
|
1873
1983
|
@staticmethod
|
1874
1984
|
def _from_csv(file_or_url: str, **kwargs):
|
1875
1985
|
"""Create a ScenarioList from a CSV file or URL."""
|
1876
1986
|
warnings.warn(
|
1877
1987
|
"_from_csv is deprecated. Use CSVSource directly or ScenarioSource.from_source('csv', ...) instead.",
|
1878
1988
|
DeprecationWarning,
|
1879
|
-
stacklevel=2
|
1989
|
+
stacklevel=2,
|
1880
1990
|
)
|
1881
1991
|
source = CSVSource(file_or_url=file_or_url, **kwargs)
|
1882
1992
|
return source.to_scenario_list()
|
1883
|
-
|
1993
|
+
|
1884
1994
|
@staticmethod
|
1885
1995
|
def _from_tsv(file_or_url: str, **kwargs):
|
1886
1996
|
"""Create a ScenarioList from a TSV file or URL."""
|
1887
1997
|
warnings.warn(
|
1888
1998
|
"_from_tsv is deprecated. Use TSVSource directly or ScenarioSource.from_source('tsv', ...) instead.",
|
1889
1999
|
DeprecationWarning,
|
1890
|
-
stacklevel=2
|
2000
|
+
stacklevel=2,
|
1891
2001
|
)
|
1892
2002
|
source = TSVSource(file_or_url=file_or_url, **kwargs)
|
1893
2003
|
return source.to_scenario_list()
|
1894
|
-
|
2004
|
+
|
1895
2005
|
@staticmethod
|
1896
2006
|
def _from_dict(data: dict):
|
1897
2007
|
"""Create a ScenarioList from a dictionary."""
|
1898
2008
|
from .scenario_list import ScenarioList
|
1899
|
-
|
2009
|
+
|
1900
2010
|
if "scenarios" in data:
|
1901
2011
|
scenarios = [Scenario(s) for s in data["scenarios"]]
|
1902
2012
|
codebook = data.get("codebook", {})
|
@@ -1907,48 +2017,48 @@ class ScenarioSource:
|
|
1907
2017
|
field_names = list(data.keys())
|
1908
2018
|
if not all(isinstance(v, list) for v in data.values()):
|
1909
2019
|
raise ScenarioError("All values in the dictionary must be lists")
|
1910
|
-
|
2020
|
+
|
1911
2021
|
# Check all lists have the same length
|
1912
2022
|
list_lengths = [len(v) for v in data.values()]
|
1913
2023
|
if not all(l == list_lengths[0] for l in list_lengths):
|
1914
2024
|
raise ScenarioError("All lists must have the same length")
|
1915
|
-
|
2025
|
+
|
1916
2026
|
# Create scenarios
|
1917
2027
|
for i in range(list_lengths[0]):
|
1918
2028
|
scenario_dict = {k: data[k][i] for k in field_names}
|
1919
2029
|
scenarios.append(Scenario(scenario_dict))
|
1920
|
-
|
2030
|
+
|
1921
2031
|
return ScenarioList(scenarios)
|
1922
|
-
|
2032
|
+
|
1923
2033
|
@staticmethod
|
1924
2034
|
def _from_nested_dict(data: dict, id_field: Optional[str] = None):
|
1925
2035
|
"""Create a ScenarioList from a nested dictionary."""
|
1926
2036
|
from .scenario_list import ScenarioList
|
1927
|
-
|
2037
|
+
|
1928
2038
|
scenarios = []
|
1929
|
-
|
2039
|
+
|
1930
2040
|
for key, value in data.items():
|
1931
2041
|
if not isinstance(value, dict):
|
1932
2042
|
raise ScenarioError(f"Value for key {key} is not a dictionary")
|
1933
|
-
|
2043
|
+
|
1934
2044
|
scenario_dict = value.copy()
|
1935
2045
|
if id_field:
|
1936
2046
|
scenario_dict[id_field] = key
|
1937
2047
|
scenarios.append(Scenario(scenario_dict))
|
1938
|
-
|
2048
|
+
|
1939
2049
|
return ScenarioList(scenarios)
|
1940
|
-
|
2050
|
+
|
1941
2051
|
@staticmethod
|
1942
2052
|
def _from_parquet(file_path: str):
|
1943
2053
|
"""Create a ScenarioList from a Parquet file."""
|
1944
2054
|
warnings.warn(
|
1945
2055
|
"_from_parquet is deprecated. Use ParquetSource directly or ScenarioSource.from_source('parquet', ...) instead.",
|
1946
2056
|
DeprecationWarning,
|
1947
|
-
stacklevel=2
|
2057
|
+
stacklevel=2,
|
1948
2058
|
)
|
1949
2059
|
source = ParquetSource(file_path)
|
1950
2060
|
return source.to_scenario_list()
|
1951
|
-
|
2061
|
+
|
1952
2062
|
@staticmethod
|
1953
2063
|
def _from_pdf(
|
1954
2064
|
file_path: str,
|
@@ -1960,16 +2070,16 @@ class ScenarioSource:
|
|
1960
2070
|
warnings.warn(
|
1961
2071
|
"_from_pdf is deprecated. Use PDFSource directly or ScenarioSource.from_source('pdf', ...) instead.",
|
1962
2072
|
DeprecationWarning,
|
1963
|
-
stacklevel=2
|
2073
|
+
stacklevel=2,
|
1964
2074
|
)
|
1965
2075
|
source = PDFSource(
|
1966
2076
|
file_path=file_path,
|
1967
2077
|
chunk_type=chunk_type,
|
1968
2078
|
chunk_size=chunk_size,
|
1969
|
-
chunk_overlap=chunk_overlap
|
2079
|
+
chunk_overlap=chunk_overlap,
|
1970
2080
|
)
|
1971
2081
|
return source.to_scenario_list()
|
1972
|
-
|
2082
|
+
|
1973
2083
|
@staticmethod
|
1974
2084
|
def _from_pdf_to_image(
|
1975
2085
|
file_path: str,
|
@@ -1980,11 +2090,9 @@ class ScenarioSource:
|
|
1980
2090
|
warnings.warn(
|
1981
2091
|
"_from_pdf_to_image is deprecated. Use PDFImageSource directly or ScenarioSource.from_source('pdf_to_image', ...) instead.",
|
1982
2092
|
DeprecationWarning,
|
1983
|
-
stacklevel=2
|
2093
|
+
stacklevel=2,
|
1984
2094
|
)
|
1985
2095
|
source = PDFImageSource(
|
1986
|
-
file_path=file_path,
|
1987
|
-
base_width=base_width,
|
1988
|
-
include_text=include_text
|
2096
|
+
file_path=file_path, base_width=base_width, include_text=include_text
|
1989
2097
|
)
|
1990
|
-
return source.to_scenario_list()
|
2098
|
+
return source.to_scenario_list()
|