edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +3 -2
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +105 -7
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/invigilators/invigilators.py +10 -1
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +127 -46
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +102 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +45 -75
- edsl/language_models/registry.py +5 -0
- edsl/language_models/utilities.py +2 -1
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_check_box.py +171 -149
- edsl/questions/question_dict.py +243 -51
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +63 -16
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +117 -6
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
edsl/scenarios/scenario.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
A Scenario is a dictionary-like object that stores key-value pairs for parameterizing questions.
|
3
3
|
|
4
4
|
Scenarios are a fundamental concept in EDSL, providing a mechanism to parameterize
|
5
|
-
questions with dynamic values. Each Scenario contains key-value pairs that can be
|
5
|
+
questions with dynamic values. Each Scenario contains key-value pairs that can be
|
6
6
|
referenced within question templates using Jinja syntax. This allows for creating
|
7
7
|
questions that vary based on the specific scenario being presented.
|
8
8
|
|
@@ -33,36 +33,35 @@ if TYPE_CHECKING:
|
|
33
33
|
from ..dataset import Dataset
|
34
34
|
|
35
35
|
|
36
|
-
|
37
36
|
class Scenario(Base, UserDict):
|
38
37
|
"""
|
39
38
|
A dictionary-like object that stores key-value pairs for parameterizing questions.
|
40
|
-
|
39
|
+
|
41
40
|
A Scenario inherits from both the EDSL Base class and Python's UserDict, allowing
|
42
41
|
it to function as a dictionary while providing additional functionality. Scenarios
|
43
42
|
are used to parameterize questions by providing variable data that can be referenced
|
44
43
|
within question templates using Jinja syntax.
|
45
|
-
|
44
|
+
|
46
45
|
Scenarios can be created directly with dictionary data or constructed from various
|
47
46
|
sources using class methods (from_file, from_url, from_pdf, etc.). They support
|
48
47
|
operations like addition (combining scenarios) and multiplication (creating cross
|
49
48
|
products with other scenarios or scenario lists).
|
50
|
-
|
49
|
+
|
51
50
|
Attributes:
|
52
51
|
data (dict): The underlying dictionary data.
|
53
52
|
name (str, optional): A name for the scenario.
|
54
|
-
|
53
|
+
|
55
54
|
Examples:
|
56
55
|
Create a simple scenario:
|
57
56
|
>>> s = Scenario({"product": "coffee", "price": 4.99})
|
58
|
-
|
57
|
+
|
59
58
|
Combine scenarios:
|
60
59
|
>>> s1 = Scenario({"product": "coffee"})
|
61
60
|
>>> s2 = Scenario({"price": 4.99})
|
62
61
|
>>> s3 = s1 + s2
|
63
62
|
>>> s3
|
64
63
|
Scenario({'product': 'coffee', 'price': 4.99})
|
65
|
-
|
64
|
+
|
66
65
|
Create a scenario from a file:
|
67
66
|
>>> import tempfile
|
68
67
|
>>> with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
|
@@ -96,14 +95,18 @@ class Scenario(Base, UserDict):
|
|
96
95
|
data = dict(data)
|
97
96
|
except Exception as e:
|
98
97
|
raise ScenarioError(
|
99
|
-
f"You must pass in a dictionary to initialize a Scenario. You passed in {data}"
|
98
|
+
f"You must pass in a dictionary to initialize a Scenario. You passed in {data}"
|
99
|
+
+ "Exception message:"
|
100
|
+
+ str(e),
|
100
101
|
)
|
101
102
|
|
102
103
|
super().__init__()
|
103
104
|
self.data = data if data is not None else {}
|
104
105
|
self.name = name
|
105
106
|
|
106
|
-
def __mul__(
|
107
|
+
def __mul__(
|
108
|
+
self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]
|
109
|
+
) -> "ScenarioList":
|
107
110
|
"""Takes the cross product of a Scenario with another Scenario or ScenarioList.
|
108
111
|
|
109
112
|
Args:
|
@@ -125,12 +128,15 @@ class Scenario(Base, UserDict):
|
|
125
128
|
True
|
126
129
|
"""
|
127
130
|
from .scenario_list import ScenarioList
|
131
|
+
|
128
132
|
if isinstance(scenario_list_or_scenario, ScenarioList):
|
129
133
|
return scenario_list_or_scenario * self
|
130
134
|
elif isinstance(scenario_list_or_scenario, Scenario):
|
131
135
|
return ScenarioList([self]) * scenario_list_or_scenario
|
132
136
|
else:
|
133
|
-
raise TypeError(
|
137
|
+
raise TypeError(
|
138
|
+
f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}"
|
139
|
+
)
|
134
140
|
|
135
141
|
def replicate(self, n: int) -> "ScenarioList":
|
136
142
|
"""Replicate a scenario n times to return a ScenarioList.
|
@@ -258,14 +264,13 @@ class Scenario(Base, UserDict):
|
|
258
264
|
"""Display a scenario as a table."""
|
259
265
|
return self.to_dataset().table(tablefmt=tablefmt)
|
260
266
|
|
261
|
-
|
262
267
|
def to_dict(self, add_edsl_version: bool = True) -> dict:
|
263
268
|
"""Convert a scenario to a dictionary.
|
264
269
|
|
265
270
|
Example:
|
266
271
|
|
267
272
|
>>> s = Scenario({"food": "wood chips"})
|
268
|
-
>>> s.to_dict()
|
273
|
+
>>> s.to_dict() # doctest: +ELLIPSIS
|
269
274
|
{'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}
|
270
275
|
|
271
276
|
>>> s.to_dict(add_edsl_version = False)
|
@@ -273,10 +278,11 @@ class Scenario(Base, UserDict):
|
|
273
278
|
|
274
279
|
"""
|
275
280
|
from edsl.scenarios import FileStore
|
281
|
+
from edsl.prompts import Prompt
|
276
282
|
|
277
283
|
d = self.data.copy()
|
278
284
|
for key, value in d.items():
|
279
|
-
if isinstance(value, FileStore):
|
285
|
+
if isinstance(value, FileStore) or isinstance(value, Prompt):
|
280
286
|
d[key] = value.to_dict(add_edsl_version=add_edsl_version)
|
281
287
|
if add_edsl_version:
|
282
288
|
from edsl import __version__
|
@@ -363,36 +369,38 @@ class Scenario(Base, UserDict):
|
|
363
369
|
return self.select(list_of_keys)
|
364
370
|
|
365
371
|
@classmethod
|
366
|
-
def from_url(
|
372
|
+
def from_url(
|
373
|
+
cls, url: str, field_name: Optional[str] = "text", testing: bool = False
|
374
|
+
) -> "Scenario":
|
367
375
|
"""
|
368
376
|
Creates a Scenario from the content of a URL.
|
369
|
-
|
370
|
-
This method fetches content from a web URL and creates a Scenario containing the URL
|
377
|
+
|
378
|
+
This method fetches content from a web URL and creates a Scenario containing the URL
|
371
379
|
and the extracted text. When available, BeautifulSoup is used for better HTML parsing
|
372
380
|
and text extraction, otherwise a basic requests approach is used.
|
373
|
-
|
381
|
+
|
374
382
|
Args:
|
375
383
|
url: The URL to fetch content from.
|
376
384
|
field_name: The key name to use for storing the extracted text in the Scenario.
|
377
385
|
Defaults to "text".
|
378
386
|
testing: If True, uses a simplified requests method instead of BeautifulSoup.
|
379
387
|
This is primarily for testing purposes.
|
380
|
-
|
388
|
+
|
381
389
|
Returns:
|
382
390
|
A Scenario containing the URL and extracted text.
|
383
|
-
|
391
|
+
|
384
392
|
Raises:
|
385
393
|
requests.exceptions.RequestException: If the URL cannot be accessed.
|
386
|
-
|
394
|
+
|
387
395
|
Examples:
|
388
396
|
>>> s = Scenario.from_url("https://example.com", testing=True)
|
389
397
|
>>> "url" in s and "text" in s
|
390
398
|
True
|
391
|
-
|
399
|
+
|
392
400
|
>>> s = Scenario.from_url("https://example.com", field_name="content", testing=True)
|
393
401
|
>>> "url" in s and "content" in s
|
394
402
|
True
|
395
|
-
|
403
|
+
|
396
404
|
Notes:
|
397
405
|
- The method attempts to use BeautifulSoup and fake_useragent for better
|
398
406
|
HTML parsing and to mimic a real browser.
|
@@ -409,24 +417,33 @@ class Scenario(Base, UserDict):
|
|
409
417
|
try:
|
410
418
|
from bs4 import BeautifulSoup
|
411
419
|
from fake_useragent import UserAgent
|
412
|
-
|
420
|
+
|
413
421
|
# Configure request headers to appear more like a regular browser
|
414
422
|
ua = UserAgent()
|
415
423
|
headers = {
|
416
|
-
|
417
|
-
|
418
|
-
|
424
|
+
"User-Agent": ua.random,
|
425
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
426
|
+
"Accept-Language": "en-US,en;q=0.5",
|
419
427
|
}
|
420
428
|
|
421
429
|
response = requests.get(url, headers=headers)
|
422
|
-
soup = BeautifulSoup(response.content,
|
423
|
-
|
430
|
+
soup = BeautifulSoup(response.content, "html.parser")
|
431
|
+
|
424
432
|
# Get text content while preserving some structure
|
425
|
-
text =
|
433
|
+
text = " ".join(
|
434
|
+
[
|
435
|
+
p.get_text(strip=True)
|
436
|
+
for p in soup.find_all(
|
437
|
+
["p", "h1", "h2", "h3", "h4", "h5", "h6"]
|
438
|
+
)
|
439
|
+
]
|
440
|
+
)
|
426
441
|
|
427
442
|
except ImportError:
|
428
443
|
# Fallback to basic requests if BeautifulSoup/fake_useragent not available
|
429
|
-
print(
|
444
|
+
print(
|
445
|
+
"BeautifulSoup/fake_useragent not available. Falling back to basic requests."
|
446
|
+
)
|
430
447
|
response = requests.get(url)
|
431
448
|
text = response.text
|
432
449
|
|
@@ -436,33 +453,33 @@ class Scenario(Base, UserDict):
|
|
436
453
|
def from_file(cls, file_path: str, field_name: str) -> "Scenario":
|
437
454
|
"""
|
438
455
|
Creates a Scenario containing a FileStore object from a file.
|
439
|
-
|
456
|
+
|
440
457
|
This method creates a Scenario with a single key-value pair where the value
|
441
458
|
is a FileStore object that encapsulates the specified file. The FileStore
|
442
459
|
handles appropriate file loading, encoding, and extraction based on the file type.
|
443
|
-
|
460
|
+
|
444
461
|
Args:
|
445
462
|
file_path: Path to the file to be incorporated into the Scenario.
|
446
463
|
field_name: Key name to use for storing the FileStore in the Scenario.
|
447
|
-
|
464
|
+
|
448
465
|
Returns:
|
449
466
|
A Scenario containing a FileStore object linked to the specified file.
|
450
|
-
|
467
|
+
|
451
468
|
Raises:
|
452
469
|
FileNotFoundError: If the specified file does not exist.
|
453
|
-
|
470
|
+
|
454
471
|
Examples:
|
455
472
|
>>> import tempfile
|
456
473
|
>>> with tempfile.NamedTemporaryFile(suffix=".txt", mode="w") as f:
|
457
474
|
... _ = f.write("This is a test.")
|
458
475
|
... _ = f.flush()
|
459
476
|
... s = Scenario.from_file(f.name, "file")
|
460
|
-
>>> s
|
477
|
+
>>> s # doctest: +ELLIPSIS
|
461
478
|
Scenario({'file': FileStore(path='...', ...)})
|
462
|
-
|
479
|
+
|
463
480
|
Notes:
|
464
481
|
- The FileStore object handles various file formats differently
|
465
|
-
- FileStore provides methods to access file content, extract text,
|
482
|
+
- FileStore provides methods to access file content, extract text,
|
466
483
|
and manage file operations appropriate to the file type
|
467
484
|
"""
|
468
485
|
from edsl.scenarios import FileStore
|
@@ -476,30 +493,30 @@ class Scenario(Base, UserDict):
|
|
476
493
|
) -> "Scenario":
|
477
494
|
"""
|
478
495
|
Creates a Scenario containing an image file as a FileStore object.
|
479
|
-
|
496
|
+
|
480
497
|
This method creates a Scenario with a single key-value pair where the value
|
481
498
|
is a FileStore object that encapsulates the specified image file. The image
|
482
499
|
is stored as a base64-encoded string, allowing it to be easily serialized
|
483
500
|
and transmitted.
|
484
|
-
|
501
|
+
|
485
502
|
Args:
|
486
503
|
image_path: Path to the image file to be incorporated into the Scenario.
|
487
504
|
image_name: Key name to use for storing the FileStore in the Scenario.
|
488
505
|
If not provided, uses the filename without extension.
|
489
|
-
|
506
|
+
|
490
507
|
Returns:
|
491
508
|
A Scenario containing a FileStore object with the image data.
|
492
|
-
|
509
|
+
|
493
510
|
Raises:
|
494
511
|
FileNotFoundError: If the specified image file does not exist.
|
495
|
-
|
512
|
+
|
496
513
|
Examples:
|
497
514
|
>>> import os
|
498
515
|
>>> # Assuming an image file exists
|
499
516
|
>>> if os.path.exists("image.jpg"):
|
500
517
|
... s = Scenario.from_image("image.jpg")
|
501
518
|
... s_named = Scenario.from_image("image.jpg", "picture")
|
502
|
-
|
519
|
+
|
503
520
|
Notes:
|
504
521
|
- The resulting FileStore can be displayed in notebooks or used in questions
|
505
522
|
- Supported image formats include JPG, PNG, GIF, etc.
|
@@ -517,27 +534,27 @@ class Scenario(Base, UserDict):
|
|
517
534
|
def from_pdf(cls, pdf_path: str) -> "Scenario":
|
518
535
|
"""
|
519
536
|
Creates a Scenario containing text extracted from a PDF file.
|
520
|
-
|
537
|
+
|
521
538
|
This method extracts text and metadata from a PDF file and creates a Scenario
|
522
539
|
containing this information. It uses the PdfExtractor class which provides
|
523
540
|
access to text content, metadata, and structure from PDF files.
|
524
|
-
|
541
|
+
|
525
542
|
Args:
|
526
543
|
pdf_path: Path to the PDF file to extract content from.
|
527
|
-
|
544
|
+
|
528
545
|
Returns:
|
529
546
|
A Scenario containing extracted text and metadata from the PDF.
|
530
|
-
|
547
|
+
|
531
548
|
Raises:
|
532
549
|
FileNotFoundError: If the specified PDF file does not exist.
|
533
550
|
ImportError: If the required PDF extraction libraries are not installed.
|
534
|
-
|
551
|
+
|
535
552
|
Examples:
|
536
553
|
>>> import os
|
537
554
|
>>> # Assuming a PDF file exists
|
538
555
|
>>> if os.path.exists("document.pdf"):
|
539
556
|
... s = Scenario.from_pdf("document.pdf")
|
540
|
-
|
557
|
+
|
541
558
|
Notes:
|
542
559
|
- The returned Scenario contains various keys with PDF content and metadata
|
543
560
|
- PDF extraction requires the PyMuPDF library
|
@@ -545,6 +562,7 @@ class Scenario(Base, UserDict):
|
|
545
562
|
"""
|
546
563
|
try:
|
547
564
|
from edsl.scenarios.PdfExtractor import PdfExtractor
|
565
|
+
|
548
566
|
extractor = PdfExtractor(pdf_path)
|
549
567
|
return Scenario(extractor.get_pdf_dict())
|
550
568
|
except ImportError as e:
|
@@ -558,31 +576,31 @@ class Scenario(Base, UserDict):
|
|
558
576
|
def from_html(cls, url: str, field_name: Optional[str] = None) -> "Scenario":
|
559
577
|
"""
|
560
578
|
Creates a Scenario containing both HTML content and extracted text from a URL.
|
561
|
-
|
579
|
+
|
562
580
|
This method fetches HTML content from a URL, extracts readable text from it,
|
563
581
|
and creates a Scenario containing the original URL, the raw HTML, and the
|
564
582
|
extracted text. Unlike from_url, this method preserves the raw HTML content.
|
565
|
-
|
583
|
+
|
566
584
|
Args:
|
567
585
|
url: URL to fetch HTML content from.
|
568
586
|
field_name: Key name to use for the extracted text in the Scenario.
|
569
587
|
If not provided, defaults to "text".
|
570
|
-
|
588
|
+
|
571
589
|
Returns:
|
572
590
|
A Scenario containing the URL, raw HTML, and extracted text.
|
573
|
-
|
591
|
+
|
574
592
|
Raises:
|
575
593
|
requests.exceptions.RequestException: If the URL cannot be accessed.
|
576
|
-
|
594
|
+
|
577
595
|
Examples:
|
578
596
|
>>> s = Scenario.from_html("https://example.com")
|
579
597
|
>>> all(key in s for key in ["url", "html", "text"])
|
580
598
|
True
|
581
|
-
|
599
|
+
|
582
600
|
>>> s = Scenario.from_html("https://example.com", field_name="content")
|
583
601
|
>>> all(key in s for key in ["url", "html", "content"])
|
584
602
|
True
|
585
|
-
|
603
|
+
|
586
604
|
Notes:
|
587
605
|
- Uses BeautifulSoup for HTML parsing when available
|
588
606
|
- Stores both the raw HTML and the extracted text
|
@@ -599,17 +617,17 @@ class Scenario(Base, UserDict):
|
|
599
617
|
def fetch_html(url: str) -> Optional[str]:
|
600
618
|
"""
|
601
619
|
Fetches HTML content from a URL with robust error handling and retries.
|
602
|
-
|
620
|
+
|
603
621
|
This method creates a session with configurable retries to fetch HTML content
|
604
622
|
from a URL. It uses a realistic user agent to avoid being blocked by websites
|
605
623
|
that filter bot traffic.
|
606
|
-
|
624
|
+
|
607
625
|
Args:
|
608
626
|
url: The URL to fetch HTML content from.
|
609
|
-
|
627
|
+
|
610
628
|
Returns:
|
611
629
|
The HTML content as a string, or None if the request failed.
|
612
|
-
|
630
|
+
|
613
631
|
Raises:
|
614
632
|
requests.exceptions.RequestException: If a request error occurs.
|
615
633
|
"""
|
@@ -642,71 +660,71 @@ class Scenario(Base, UserDict):
|
|
642
660
|
def extract_text(html: Optional[str]) -> str:
|
643
661
|
"""
|
644
662
|
Extracts readable text from HTML content using BeautifulSoup.
|
645
|
-
|
663
|
+
|
646
664
|
This method parses HTML content and extracts the readable text while
|
647
665
|
removing HTML tags and script content.
|
648
|
-
|
666
|
+
|
649
667
|
Args:
|
650
668
|
html: The HTML content to extract text from.
|
651
|
-
|
669
|
+
|
652
670
|
Returns:
|
653
671
|
The extracted text content as a string. Returns an empty string
|
654
672
|
if the input is None or if parsing fails.
|
655
673
|
"""
|
656
674
|
if html is None:
|
657
675
|
return ""
|
658
|
-
|
676
|
+
|
659
677
|
try:
|
660
678
|
from bs4 import BeautifulSoup
|
679
|
+
|
661
680
|
soup = BeautifulSoup(html, "html.parser")
|
662
|
-
|
681
|
+
|
663
682
|
# Remove script and style elements that might contain non-readable content
|
664
683
|
for element in soup(["script", "style"]):
|
665
684
|
element.extract()
|
666
|
-
|
685
|
+
|
667
686
|
text = soup.get_text()
|
668
|
-
|
687
|
+
|
669
688
|
# Normalize whitespace
|
670
689
|
lines = (line.strip() for line in text.splitlines())
|
671
690
|
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
672
|
-
text =
|
673
|
-
|
691
|
+
text = "\n".join(chunk for chunk in chunks if chunk)
|
692
|
+
|
674
693
|
return text
|
675
694
|
except Exception as e:
|
676
695
|
print(f"Error extracting text from HTML: {e}")
|
677
696
|
return ""
|
678
697
|
|
679
|
-
|
680
698
|
@classmethod
|
681
699
|
def from_pdf_to_image(cls, pdf_path: str, image_format: str = "jpeg") -> "Scenario":
|
682
700
|
"""
|
683
701
|
Converts each page of a PDF into an image and creates a Scenario containing them.
|
684
|
-
|
702
|
+
|
685
703
|
This method takes a PDF file, converts each page to an image in the specified
|
686
704
|
format, and creates a Scenario containing the original file path and FileStore
|
687
705
|
objects for each page image. This is particularly useful for visualizing PDF
|
688
706
|
content or for image-based processing of PDF documents.
|
689
|
-
|
707
|
+
|
690
708
|
Args:
|
691
709
|
pdf_path: Path to the PDF file to convert to images.
|
692
710
|
image_format: Format of the output images (default is 'jpeg').
|
693
711
|
Other formats include 'png', 'tiff', etc.
|
694
|
-
|
712
|
+
|
695
713
|
Returns:
|
696
714
|
A Scenario containing the original PDF file path and FileStore objects
|
697
715
|
for each page image, with keys like "page_0", "page_1", etc.
|
698
|
-
|
716
|
+
|
699
717
|
Raises:
|
700
718
|
FileNotFoundError: If the specified PDF file does not exist.
|
701
719
|
ImportError: If pdf2image is not installed.
|
702
|
-
|
720
|
+
|
703
721
|
Examples:
|
704
722
|
>>> import os
|
705
723
|
>>> # Assuming a PDF file exists
|
706
724
|
>>> if os.path.exists("document.pdf"):
|
707
725
|
... s = Scenario.from_pdf_to_image("document.pdf")
|
708
726
|
... s_png = Scenario.from_pdf_to_image("document.pdf", "png")
|
709
|
-
|
727
|
+
|
710
728
|
Notes:
|
711
729
|
- Requires the pdf2image library which depends on poppler
|
712
730
|
- Creates a separate image for each page of the PDF
|
@@ -729,6 +747,7 @@ class Scenario(Base, UserDict):
|
|
729
747
|
image.save(image_path, image_format.upper())
|
730
748
|
|
731
749
|
from edsl.scenarios import FileStore
|
750
|
+
|
732
751
|
scenario_dict[f"page_{i}"] = FileStore(image_path)
|
733
752
|
|
734
753
|
scenario = Scenario(scenario_dict)
|
@@ -739,21 +758,21 @@ class Scenario(Base, UserDict):
|
|
739
758
|
def from_docx(cls, docx_path: str) -> "Scenario":
|
740
759
|
"""
|
741
760
|
Creates a Scenario containing text extracted from a Microsoft Word document.
|
742
|
-
|
761
|
+
|
743
762
|
This method extracts text and structure from a DOCX file and creates a Scenario
|
744
|
-
containing this information. It uses the DocxScenario class to handle the
|
763
|
+
containing this information. It uses the DocxScenario class to handle the
|
745
764
|
extraction process and maintain document structure where possible.
|
746
|
-
|
765
|
+
|
747
766
|
Args:
|
748
767
|
docx_path: Path to the DOCX file to extract content from.
|
749
|
-
|
768
|
+
|
750
769
|
Returns:
|
751
770
|
A Scenario containing the file path and extracted text from the DOCX file.
|
752
|
-
|
771
|
+
|
753
772
|
Raises:
|
754
773
|
FileNotFoundError: If the specified DOCX file does not exist.
|
755
774
|
ImportError: If the python-docx library is not installed.
|
756
|
-
|
775
|
+
|
757
776
|
Examples:
|
758
777
|
>>> from docx import Document
|
759
778
|
>>> doc = Document()
|
@@ -764,7 +783,7 @@ class Scenario(Base, UserDict):
|
|
764
783
|
>>> s
|
765
784
|
Scenario({'file_path': 'test.docx', 'text': 'EDSL Survey\\nThis is a test.'})
|
766
785
|
>>> import os; os.remove("test.docx")
|
767
|
-
|
786
|
+
|
768
787
|
Notes:
|
769
788
|
- The returned Scenario typically contains the file path and extracted text
|
770
789
|
- The extraction process attempts to maintain document structure
|
@@ -784,12 +803,12 @@ class Scenario(Base, UserDict):
|
|
784
803
|
) -> "ScenarioList":
|
785
804
|
"""
|
786
805
|
Splits a text field into chunks of a specified size, creating a ScenarioList.
|
787
|
-
|
806
|
+
|
788
807
|
This method takes a field containing text and divides it into smaller chunks
|
789
808
|
based on either word count or line count. It's particularly useful for processing
|
790
809
|
large text documents in manageable pieces, such as for summarization, analysis,
|
791
810
|
or when working with models that have token limits.
|
792
|
-
|
811
|
+
|
793
812
|
Args:
|
794
813
|
field: The key name of the field in the Scenario to split.
|
795
814
|
num_words: The number of words to include in each chunk. Mutually exclusive
|
@@ -800,16 +819,16 @@ class Scenario(Base, UserDict):
|
|
800
819
|
with a "_original" suffix.
|
801
820
|
hash_original: If True and include_original is True, stores a hash of the
|
802
821
|
original text instead of the full text.
|
803
|
-
|
822
|
+
|
804
823
|
Returns:
|
805
824
|
A ScenarioList containing multiple Scenarios, each with a chunk of the
|
806
825
|
original text. Each Scenario includes the chunk text, chunk index, character
|
807
826
|
count, and word count.
|
808
|
-
|
827
|
+
|
809
828
|
Raises:
|
810
829
|
ValueError: If neither num_words nor num_lines is specified, or if both are.
|
811
830
|
KeyError: If the specified field doesn't exist in the Scenario.
|
812
|
-
|
831
|
+
|
813
832
|
Examples:
|
814
833
|
Split by lines (1 line per chunk):
|
815
834
|
>>> s = Scenario({"text": "This is a test.\\nThis is a test.\\n\\nThis is a test."})
|
@@ -828,7 +847,7 @@ class Scenario(Base, UserDict):
|
|
828
847
|
Use a hash of the original text:
|
829
848
|
>>> s.chunk("text", num_words=1, include_original=True, hash_original=True)
|
830
849
|
ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])
|
831
|
-
|
850
|
+
|
832
851
|
Notes:
|
833
852
|
- Either num_words or num_lines must be specified, but not both
|
834
853
|
- Each chunk is assigned a sequential index in the 'text_chunk' field
|
@@ -847,28 +866,28 @@ class Scenario(Base, UserDict):
|
|
847
866
|
def from_dict(cls, d: dict) -> "Scenario":
|
848
867
|
"""
|
849
868
|
Creates a Scenario from a dictionary, with special handling for FileStore objects.
|
850
|
-
|
869
|
+
|
851
870
|
This method creates a Scenario using the provided dictionary. It has special handling
|
852
871
|
for dictionary values that represent serialized FileStore objects, which it will
|
853
872
|
deserialize back into proper FileStore instances.
|
854
|
-
|
873
|
+
|
855
874
|
Args:
|
856
875
|
d: A dictionary to convert to a Scenario.
|
857
|
-
|
876
|
+
|
858
877
|
Returns:
|
859
878
|
A new Scenario containing the provided dictionary data.
|
860
|
-
|
879
|
+
|
861
880
|
Examples:
|
862
881
|
>>> Scenario.from_dict({"food": "wood chips"})
|
863
882
|
Scenario({'food': 'wood chips'})
|
864
|
-
|
883
|
+
|
865
884
|
>>> # Example with a serialized FileStore
|
866
|
-
>>> from edsl import FileStore
|
867
|
-
>>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="}
|
868
|
-
>>> s = Scenario.from_dict({"document": file_dict})
|
869
|
-
>>> isinstance(s["document"], FileStore)
|
885
|
+
>>> from edsl import FileStore # doctest: +SKIP
|
886
|
+
>>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="} # doctest: +SKIP
|
887
|
+
>>> s = Scenario.from_dict({"document": file_dict}) # doctest: +SKIP
|
888
|
+
>>> isinstance(s["document"], FileStore) # doctest: +SKIP
|
870
889
|
True
|
871
|
-
|
890
|
+
|
872
891
|
Notes:
|
873
892
|
- Any dictionary values that match the FileStore format will be converted to FileStore objects
|
874
893
|
- The method detects FileStore objects by looking for "base64_string" and "path" keys
|