edsl 0.1.54__py3-none-any.whl → 0.1.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/buckets/__init__.py +8 -3
  8. edsl/buckets/bucket_collection.py +9 -3
  9. edsl/buckets/model_buckets.py +4 -2
  10. edsl/buckets/token_bucket.py +2 -2
  11. edsl/buckets/token_bucket_client.py +5 -3
  12. edsl/caching/cache.py +131 -62
  13. edsl/caching/cache_entry.py +70 -58
  14. edsl/caching/sql_dict.py +17 -0
  15. edsl/cli.py +99 -0
  16. edsl/config/config_class.py +16 -0
  17. edsl/conversation/__init__.py +31 -0
  18. edsl/coop/coop.py +276 -242
  19. edsl/coop/coop_jobs_objects.py +59 -0
  20. edsl/coop/coop_objects.py +29 -0
  21. edsl/coop/coop_regular_objects.py +26 -0
  22. edsl/coop/utils.py +24 -19
  23. edsl/dataset/dataset.py +338 -101
  24. edsl/db_list/sqlite_list.py +349 -0
  25. edsl/inference_services/__init__.py +40 -5
  26. edsl/inference_services/exceptions.py +11 -0
  27. edsl/inference_services/services/anthropic_service.py +5 -2
  28. edsl/inference_services/services/aws_bedrock.py +6 -2
  29. edsl/inference_services/services/azure_ai.py +6 -2
  30. edsl/inference_services/services/google_service.py +3 -2
  31. edsl/inference_services/services/mistral_ai_service.py +6 -2
  32. edsl/inference_services/services/open_ai_service.py +6 -2
  33. edsl/inference_services/services/perplexity_service.py +6 -2
  34. edsl/inference_services/services/test_service.py +94 -5
  35. edsl/interviews/answering_function.py +167 -59
  36. edsl/interviews/interview.py +124 -72
  37. edsl/interviews/interview_task_manager.py +10 -0
  38. edsl/invigilators/invigilators.py +9 -0
  39. edsl/jobs/async_interview_runner.py +146 -104
  40. edsl/jobs/data_structures.py +6 -4
  41. edsl/jobs/decorators.py +61 -0
  42. edsl/jobs/fetch_invigilator.py +61 -18
  43. edsl/jobs/html_table_job_logger.py +14 -2
  44. edsl/jobs/jobs.py +180 -104
  45. edsl/jobs/jobs_component_constructor.py +2 -2
  46. edsl/jobs/jobs_interview_constructor.py +2 -0
  47. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  48. edsl/jobs/jobs_runner_status.py +30 -25
  49. edsl/jobs/progress_bar_manager.py +79 -0
  50. edsl/jobs/remote_inference.py +35 -1
  51. edsl/key_management/key_lookup_builder.py +6 -1
  52. edsl/language_models/language_model.py +86 -6
  53. edsl/language_models/model.py +10 -3
  54. edsl/language_models/price_manager.py +45 -75
  55. edsl/language_models/registry.py +5 -0
  56. edsl/notebooks/notebook.py +77 -10
  57. edsl/questions/VALIDATION_README.md +134 -0
  58. edsl/questions/__init__.py +24 -1
  59. edsl/questions/exceptions.py +21 -0
  60. edsl/questions/question_dict.py +201 -16
  61. edsl/questions/question_multiple_choice_with_other.py +624 -0
  62. edsl/questions/question_registry.py +2 -1
  63. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  64. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  65. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  66. edsl/questions/validation_analysis.py +185 -0
  67. edsl/questions/validation_cli.py +131 -0
  68. edsl/questions/validation_html_report.py +404 -0
  69. edsl/questions/validation_logger.py +136 -0
  70. edsl/results/result.py +63 -16
  71. edsl/results/results.py +702 -171
  72. edsl/scenarios/construct_download_link.py +16 -3
  73. edsl/scenarios/directory_scanner.py +226 -226
  74. edsl/scenarios/file_methods.py +5 -0
  75. edsl/scenarios/file_store.py +117 -6
  76. edsl/scenarios/handlers/__init__.py +5 -1
  77. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  78. edsl/scenarios/handlers/webm_file_store.py +104 -0
  79. edsl/scenarios/scenario.py +120 -101
  80. edsl/scenarios/scenario_list.py +800 -727
  81. edsl/scenarios/scenario_list_gc_test.py +146 -0
  82. edsl/scenarios/scenario_list_memory_test.py +214 -0
  83. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  84. edsl/scenarios/scenario_selector.py +5 -4
  85. edsl/scenarios/scenario_source.py +1990 -0
  86. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  87. edsl/surveys/survey.py +22 -0
  88. edsl/tasks/__init__.py +4 -2
  89. edsl/tasks/task_history.py +198 -36
  90. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  91. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  92. edsl/utilities/__init__.py +2 -1
  93. edsl/utilities/decorators.py +121 -0
  94. edsl/utilities/memory_debugger.py +1010 -0
  95. {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/METADATA +51 -76
  96. {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/RECORD +99 -75
  97. edsl/jobs/jobs_runner_asyncio.py +0 -281
  98. edsl/language_models/unused/fake_openai_service.py +0 -60
  99. {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
  100. {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
  101. {edsl-0.1.54.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  A Scenario is a dictionary-like object that stores key-value pairs for parameterizing questions.
3
3
 
4
4
  Scenarios are a fundamental concept in EDSL, providing a mechanism to parameterize
5
- questions with dynamic values. Each Scenario contains key-value pairs that can be
5
+ questions with dynamic values. Each Scenario contains key-value pairs that can be
6
6
  referenced within question templates using Jinja syntax. This allows for creating
7
7
  questions that vary based on the specific scenario being presented.
8
8
 
@@ -33,36 +33,35 @@ if TYPE_CHECKING:
33
33
  from ..dataset import Dataset
34
34
 
35
35
 
36
-
37
36
  class Scenario(Base, UserDict):
38
37
  """
39
38
  A dictionary-like object that stores key-value pairs for parameterizing questions.
40
-
39
+
41
40
  A Scenario inherits from both the EDSL Base class and Python's UserDict, allowing
42
41
  it to function as a dictionary while providing additional functionality. Scenarios
43
42
  are used to parameterize questions by providing variable data that can be referenced
44
43
  within question templates using Jinja syntax.
45
-
44
+
46
45
  Scenarios can be created directly with dictionary data or constructed from various
47
46
  sources using class methods (from_file, from_url, from_pdf, etc.). They support
48
47
  operations like addition (combining scenarios) and multiplication (creating cross
49
48
  products with other scenarios or scenario lists).
50
-
49
+
51
50
  Attributes:
52
51
  data (dict): The underlying dictionary data.
53
52
  name (str, optional): A name for the scenario.
54
-
53
+
55
54
  Examples:
56
55
  Create a simple scenario:
57
56
  >>> s = Scenario({"product": "coffee", "price": 4.99})
58
-
57
+
59
58
  Combine scenarios:
60
59
  >>> s1 = Scenario({"product": "coffee"})
61
60
  >>> s2 = Scenario({"price": 4.99})
62
61
  >>> s3 = s1 + s2
63
62
  >>> s3
64
63
  Scenario({'product': 'coffee', 'price': 4.99})
65
-
64
+
66
65
  Create a scenario from a file:
67
66
  >>> import tempfile
68
67
  >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
@@ -96,14 +95,18 @@ class Scenario(Base, UserDict):
96
95
  data = dict(data)
97
96
  except Exception as e:
98
97
  raise ScenarioError(
99
- f"You must pass in a dictionary to initialize a Scenario. You passed in {data}" + "Exception message:" + str(e),
98
+ f"You must pass in a dictionary to initialize a Scenario. You passed in {data}"
99
+ + "Exception message:"
100
+ + str(e),
100
101
  )
101
102
 
102
103
  super().__init__()
103
104
  self.data = data if data is not None else {}
104
105
  self.name = name
105
106
 
106
- def __mul__(self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]) -> "ScenarioList":
107
+ def __mul__(
108
+ self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]
109
+ ) -> "ScenarioList":
107
110
  """Takes the cross product of a Scenario with another Scenario or ScenarioList.
108
111
 
109
112
  Args:
@@ -125,12 +128,15 @@ class Scenario(Base, UserDict):
125
128
  True
126
129
  """
127
130
  from .scenario_list import ScenarioList
131
+
128
132
  if isinstance(scenario_list_or_scenario, ScenarioList):
129
133
  return scenario_list_or_scenario * self
130
134
  elif isinstance(scenario_list_or_scenario, Scenario):
131
135
  return ScenarioList([self]) * scenario_list_or_scenario
132
136
  else:
133
- raise TypeError(f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}")
137
+ raise TypeError(
138
+ f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}"
139
+ )
134
140
 
135
141
  def replicate(self, n: int) -> "ScenarioList":
136
142
  """Replicate a scenario n times to return a ScenarioList.
@@ -258,14 +264,13 @@ class Scenario(Base, UserDict):
258
264
  """Display a scenario as a table."""
259
265
  return self.to_dataset().table(tablefmt=tablefmt)
260
266
 
261
-
262
267
  def to_dict(self, add_edsl_version: bool = True) -> dict:
263
268
  """Convert a scenario to a dictionary.
264
269
 
265
270
  Example:
266
271
 
267
272
  >>> s = Scenario({"food": "wood chips"})
268
- >>> s.to_dict()
273
+ >>> s.to_dict() # doctest: +ELLIPSIS
269
274
  {'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}
270
275
 
271
276
  >>> s.to_dict(add_edsl_version = False)
@@ -273,10 +278,11 @@ class Scenario(Base, UserDict):
273
278
 
274
279
  """
275
280
  from edsl.scenarios import FileStore
281
+ from edsl.prompts import Prompt
276
282
 
277
283
  d = self.data.copy()
278
284
  for key, value in d.items():
279
- if isinstance(value, FileStore):
285
+ if isinstance(value, FileStore) or isinstance(value, Prompt):
280
286
  d[key] = value.to_dict(add_edsl_version=add_edsl_version)
281
287
  if add_edsl_version:
282
288
  from edsl import __version__
@@ -363,36 +369,38 @@ class Scenario(Base, UserDict):
363
369
  return self.select(list_of_keys)
364
370
 
365
371
  @classmethod
366
- def from_url(cls, url: str, field_name: Optional[str] = "text", testing: bool = False) -> "Scenario":
372
+ def from_url(
373
+ cls, url: str, field_name: Optional[str] = "text", testing: bool = False
374
+ ) -> "Scenario":
367
375
  """
368
376
  Creates a Scenario from the content of a URL.
369
-
370
- This method fetches content from a web URL and creates a Scenario containing the URL
377
+
378
+ This method fetches content from a web URL and creates a Scenario containing the URL
371
379
  and the extracted text. When available, BeautifulSoup is used for better HTML parsing
372
380
  and text extraction, otherwise a basic requests approach is used.
373
-
381
+
374
382
  Args:
375
383
  url: The URL to fetch content from.
376
384
  field_name: The key name to use for storing the extracted text in the Scenario.
377
385
  Defaults to "text".
378
386
  testing: If True, uses a simplified requests method instead of BeautifulSoup.
379
387
  This is primarily for testing purposes.
380
-
388
+
381
389
  Returns:
382
390
  A Scenario containing the URL and extracted text.
383
-
391
+
384
392
  Raises:
385
393
  requests.exceptions.RequestException: If the URL cannot be accessed.
386
-
394
+
387
395
  Examples:
388
396
  >>> s = Scenario.from_url("https://example.com", testing=True)
389
397
  >>> "url" in s and "text" in s
390
398
  True
391
-
399
+
392
400
  >>> s = Scenario.from_url("https://example.com", field_name="content", testing=True)
393
401
  >>> "url" in s and "content" in s
394
402
  True
395
-
403
+
396
404
  Notes:
397
405
  - The method attempts to use BeautifulSoup and fake_useragent for better
398
406
  HTML parsing and to mimic a real browser.
@@ -409,24 +417,33 @@ class Scenario(Base, UserDict):
409
417
  try:
410
418
  from bs4 import BeautifulSoup
411
419
  from fake_useragent import UserAgent
412
-
420
+
413
421
  # Configure request headers to appear more like a regular browser
414
422
  ua = UserAgent()
415
423
  headers = {
416
- 'User-Agent': ua.random,
417
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
418
- 'Accept-Language': 'en-US,en;q=0.5'
424
+ "User-Agent": ua.random,
425
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
426
+ "Accept-Language": "en-US,en;q=0.5",
419
427
  }
420
428
 
421
429
  response = requests.get(url, headers=headers)
422
- soup = BeautifulSoup(response.content, 'html.parser')
423
-
430
+ soup = BeautifulSoup(response.content, "html.parser")
431
+
424
432
  # Get text content while preserving some structure
425
- text = ' '.join([p.get_text(strip=True) for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
433
+ text = " ".join(
434
+ [
435
+ p.get_text(strip=True)
436
+ for p in soup.find_all(
437
+ ["p", "h1", "h2", "h3", "h4", "h5", "h6"]
438
+ )
439
+ ]
440
+ )
426
441
 
427
442
  except ImportError:
428
443
  # Fallback to basic requests if BeautifulSoup/fake_useragent not available
429
- print("BeautifulSoup/fake_useragent not available. Falling back to basic requests.")
444
+ print(
445
+ "BeautifulSoup/fake_useragent not available. Falling back to basic requests."
446
+ )
430
447
  response = requests.get(url)
431
448
  text = response.text
432
449
 
@@ -436,33 +453,33 @@ class Scenario(Base, UserDict):
436
453
  def from_file(cls, file_path: str, field_name: str) -> "Scenario":
437
454
  """
438
455
  Creates a Scenario containing a FileStore object from a file.
439
-
456
+
440
457
  This method creates a Scenario with a single key-value pair where the value
441
458
  is a FileStore object that encapsulates the specified file. The FileStore
442
459
  handles appropriate file loading, encoding, and extraction based on the file type.
443
-
460
+
444
461
  Args:
445
462
  file_path: Path to the file to be incorporated into the Scenario.
446
463
  field_name: Key name to use for storing the FileStore in the Scenario.
447
-
464
+
448
465
  Returns:
449
466
  A Scenario containing a FileStore object linked to the specified file.
450
-
467
+
451
468
  Raises:
452
469
  FileNotFoundError: If the specified file does not exist.
453
-
470
+
454
471
  Examples:
455
472
  >>> import tempfile
456
473
  >>> with tempfile.NamedTemporaryFile(suffix=".txt", mode="w") as f:
457
474
  ... _ = f.write("This is a test.")
458
475
  ... _ = f.flush()
459
476
  ... s = Scenario.from_file(f.name, "file")
460
- >>> s
477
+ >>> s # doctest: +ELLIPSIS
461
478
  Scenario({'file': FileStore(path='...', ...)})
462
-
479
+
463
480
  Notes:
464
481
  - The FileStore object handles various file formats differently
465
- - FileStore provides methods to access file content, extract text,
482
+ - FileStore provides methods to access file content, extract text,
466
483
  and manage file operations appropriate to the file type
467
484
  """
468
485
  from edsl.scenarios import FileStore
@@ -476,30 +493,30 @@ class Scenario(Base, UserDict):
476
493
  ) -> "Scenario":
477
494
  """
478
495
  Creates a Scenario containing an image file as a FileStore object.
479
-
496
+
480
497
  This method creates a Scenario with a single key-value pair where the value
481
498
  is a FileStore object that encapsulates the specified image file. The image
482
499
  is stored as a base64-encoded string, allowing it to be easily serialized
483
500
  and transmitted.
484
-
501
+
485
502
  Args:
486
503
  image_path: Path to the image file to be incorporated into the Scenario.
487
504
  image_name: Key name to use for storing the FileStore in the Scenario.
488
505
  If not provided, uses the filename without extension.
489
-
506
+
490
507
  Returns:
491
508
  A Scenario containing a FileStore object with the image data.
492
-
509
+
493
510
  Raises:
494
511
  FileNotFoundError: If the specified image file does not exist.
495
-
512
+
496
513
  Examples:
497
514
  >>> import os
498
515
  >>> # Assuming an image file exists
499
516
  >>> if os.path.exists("image.jpg"):
500
517
  ... s = Scenario.from_image("image.jpg")
501
518
  ... s_named = Scenario.from_image("image.jpg", "picture")
502
-
519
+
503
520
  Notes:
504
521
  - The resulting FileStore can be displayed in notebooks or used in questions
505
522
  - Supported image formats include JPG, PNG, GIF, etc.
@@ -517,27 +534,27 @@ class Scenario(Base, UserDict):
517
534
  def from_pdf(cls, pdf_path: str) -> "Scenario":
518
535
  """
519
536
  Creates a Scenario containing text extracted from a PDF file.
520
-
537
+
521
538
  This method extracts text and metadata from a PDF file and creates a Scenario
522
539
  containing this information. It uses the PdfExtractor class which provides
523
540
  access to text content, metadata, and structure from PDF files.
524
-
541
+
525
542
  Args:
526
543
  pdf_path: Path to the PDF file to extract content from.
527
-
544
+
528
545
  Returns:
529
546
  A Scenario containing extracted text and metadata from the PDF.
530
-
547
+
531
548
  Raises:
532
549
  FileNotFoundError: If the specified PDF file does not exist.
533
550
  ImportError: If the required PDF extraction libraries are not installed.
534
-
551
+
535
552
  Examples:
536
553
  >>> import os
537
554
  >>> # Assuming a PDF file exists
538
555
  >>> if os.path.exists("document.pdf"):
539
556
  ... s = Scenario.from_pdf("document.pdf")
540
-
557
+
541
558
  Notes:
542
559
  - The returned Scenario contains various keys with PDF content and metadata
543
560
  - PDF extraction requires the PyMuPDF library
@@ -545,6 +562,7 @@ class Scenario(Base, UserDict):
545
562
  """
546
563
  try:
547
564
  from edsl.scenarios.PdfExtractor import PdfExtractor
565
+
548
566
  extractor = PdfExtractor(pdf_path)
549
567
  return Scenario(extractor.get_pdf_dict())
550
568
  except ImportError as e:
@@ -558,31 +576,31 @@ class Scenario(Base, UserDict):
558
576
  def from_html(cls, url: str, field_name: Optional[str] = None) -> "Scenario":
559
577
  """
560
578
  Creates a Scenario containing both HTML content and extracted text from a URL.
561
-
579
+
562
580
  This method fetches HTML content from a URL, extracts readable text from it,
563
581
  and creates a Scenario containing the original URL, the raw HTML, and the
564
582
  extracted text. Unlike from_url, this method preserves the raw HTML content.
565
-
583
+
566
584
  Args:
567
585
  url: URL to fetch HTML content from.
568
586
  field_name: Key name to use for the extracted text in the Scenario.
569
587
  If not provided, defaults to "text".
570
-
588
+
571
589
  Returns:
572
590
  A Scenario containing the URL, raw HTML, and extracted text.
573
-
591
+
574
592
  Raises:
575
593
  requests.exceptions.RequestException: If the URL cannot be accessed.
576
-
594
+
577
595
  Examples:
578
596
  >>> s = Scenario.from_html("https://example.com")
579
597
  >>> all(key in s for key in ["url", "html", "text"])
580
598
  True
581
-
599
+
582
600
  >>> s = Scenario.from_html("https://example.com", field_name="content")
583
601
  >>> all(key in s for key in ["url", "html", "content"])
584
602
  True
585
-
603
+
586
604
  Notes:
587
605
  - Uses BeautifulSoup for HTML parsing when available
588
606
  - Stores both the raw HTML and the extracted text
@@ -599,17 +617,17 @@ class Scenario(Base, UserDict):
599
617
  def fetch_html(url: str) -> Optional[str]:
600
618
  """
601
619
  Fetches HTML content from a URL with robust error handling and retries.
602
-
620
+
603
621
  This method creates a session with configurable retries to fetch HTML content
604
622
  from a URL. It uses a realistic user agent to avoid being blocked by websites
605
623
  that filter bot traffic.
606
-
624
+
607
625
  Args:
608
626
  url: The URL to fetch HTML content from.
609
-
627
+
610
628
  Returns:
611
629
  The HTML content as a string, or None if the request failed.
612
-
630
+
613
631
  Raises:
614
632
  requests.exceptions.RequestException: If a request error occurs.
615
633
  """
@@ -642,71 +660,71 @@ class Scenario(Base, UserDict):
642
660
  def extract_text(html: Optional[str]) -> str:
643
661
  """
644
662
  Extracts readable text from HTML content using BeautifulSoup.
645
-
663
+
646
664
  This method parses HTML content and extracts the readable text while
647
665
  removing HTML tags and script content.
648
-
666
+
649
667
  Args:
650
668
  html: The HTML content to extract text from.
651
-
669
+
652
670
  Returns:
653
671
  The extracted text content as a string. Returns an empty string
654
672
  if the input is None or if parsing fails.
655
673
  """
656
674
  if html is None:
657
675
  return ""
658
-
676
+
659
677
  try:
660
678
  from bs4 import BeautifulSoup
679
+
661
680
  soup = BeautifulSoup(html, "html.parser")
662
-
681
+
663
682
  # Remove script and style elements that might contain non-readable content
664
683
  for element in soup(["script", "style"]):
665
684
  element.extract()
666
-
685
+
667
686
  text = soup.get_text()
668
-
687
+
669
688
  # Normalize whitespace
670
689
  lines = (line.strip() for line in text.splitlines())
671
690
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
672
- text = '\n'.join(chunk for chunk in chunks if chunk)
673
-
691
+ text = "\n".join(chunk for chunk in chunks if chunk)
692
+
674
693
  return text
675
694
  except Exception as e:
676
695
  print(f"Error extracting text from HTML: {e}")
677
696
  return ""
678
697
 
679
-
680
698
  @classmethod
681
699
  def from_pdf_to_image(cls, pdf_path: str, image_format: str = "jpeg") -> "Scenario":
682
700
  """
683
701
  Converts each page of a PDF into an image and creates a Scenario containing them.
684
-
702
+
685
703
  This method takes a PDF file, converts each page to an image in the specified
686
704
  format, and creates a Scenario containing the original file path and FileStore
687
705
  objects for each page image. This is particularly useful for visualizing PDF
688
706
  content or for image-based processing of PDF documents.
689
-
707
+
690
708
  Args:
691
709
  pdf_path: Path to the PDF file to convert to images.
692
710
  image_format: Format of the output images (default is 'jpeg').
693
711
  Other formats include 'png', 'tiff', etc.
694
-
712
+
695
713
  Returns:
696
714
  A Scenario containing the original PDF file path and FileStore objects
697
715
  for each page image, with keys like "page_0", "page_1", etc.
698
-
716
+
699
717
  Raises:
700
718
  FileNotFoundError: If the specified PDF file does not exist.
701
719
  ImportError: If pdf2image is not installed.
702
-
720
+
703
721
  Examples:
704
722
  >>> import os
705
723
  >>> # Assuming a PDF file exists
706
724
  >>> if os.path.exists("document.pdf"):
707
725
  ... s = Scenario.from_pdf_to_image("document.pdf")
708
726
  ... s_png = Scenario.from_pdf_to_image("document.pdf", "png")
709
-
727
+
710
728
  Notes:
711
729
  - Requires the pdf2image library which depends on poppler
712
730
  - Creates a separate image for each page of the PDF
@@ -729,6 +747,7 @@ class Scenario(Base, UserDict):
729
747
  image.save(image_path, image_format.upper())
730
748
 
731
749
  from edsl.scenarios import FileStore
750
+
732
751
  scenario_dict[f"page_{i}"] = FileStore(image_path)
733
752
 
734
753
  scenario = Scenario(scenario_dict)
@@ -739,21 +758,21 @@ class Scenario(Base, UserDict):
739
758
  def from_docx(cls, docx_path: str) -> "Scenario":
740
759
  """
741
760
  Creates a Scenario containing text extracted from a Microsoft Word document.
742
-
761
+
743
762
  This method extracts text and structure from a DOCX file and creates a Scenario
744
- containing this information. It uses the DocxScenario class to handle the
763
+ containing this information. It uses the DocxScenario class to handle the
745
764
  extraction process and maintain document structure where possible.
746
-
765
+
747
766
  Args:
748
767
  docx_path: Path to the DOCX file to extract content from.
749
-
768
+
750
769
  Returns:
751
770
  A Scenario containing the file path and extracted text from the DOCX file.
752
-
771
+
753
772
  Raises:
754
773
  FileNotFoundError: If the specified DOCX file does not exist.
755
774
  ImportError: If the python-docx library is not installed.
756
-
775
+
757
776
  Examples:
758
777
  >>> from docx import Document
759
778
  >>> doc = Document()
@@ -764,7 +783,7 @@ class Scenario(Base, UserDict):
764
783
  >>> s
765
784
  Scenario({'file_path': 'test.docx', 'text': 'EDSL Survey\\nThis is a test.'})
766
785
  >>> import os; os.remove("test.docx")
767
-
786
+
768
787
  Notes:
769
788
  - The returned Scenario typically contains the file path and extracted text
770
789
  - The extraction process attempts to maintain document structure
@@ -784,12 +803,12 @@ class Scenario(Base, UserDict):
784
803
  ) -> "ScenarioList":
785
804
  """
786
805
  Splits a text field into chunks of a specified size, creating a ScenarioList.
787
-
806
+
788
807
  This method takes a field containing text and divides it into smaller chunks
789
808
  based on either word count or line count. It's particularly useful for processing
790
809
  large text documents in manageable pieces, such as for summarization, analysis,
791
810
  or when working with models that have token limits.
792
-
811
+
793
812
  Args:
794
813
  field: The key name of the field in the Scenario to split.
795
814
  num_words: The number of words to include in each chunk. Mutually exclusive
@@ -800,16 +819,16 @@ class Scenario(Base, UserDict):
800
819
  with a "_original" suffix.
801
820
  hash_original: If True and include_original is True, stores a hash of the
802
821
  original text instead of the full text.
803
-
822
+
804
823
  Returns:
805
824
  A ScenarioList containing multiple Scenarios, each with a chunk of the
806
825
  original text. Each Scenario includes the chunk text, chunk index, character
807
826
  count, and word count.
808
-
827
+
809
828
  Raises:
810
829
  ValueError: If neither num_words nor num_lines is specified, or if both are.
811
830
  KeyError: If the specified field doesn't exist in the Scenario.
812
-
831
+
813
832
  Examples:
814
833
  Split by lines (1 line per chunk):
815
834
  >>> s = Scenario({"text": "This is a test.\\nThis is a test.\\n\\nThis is a test."})
@@ -828,7 +847,7 @@ class Scenario(Base, UserDict):
828
847
  Use a hash of the original text:
829
848
  >>> s.chunk("text", num_words=1, include_original=True, hash_original=True)
830
849
  ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])
831
-
850
+
832
851
  Notes:
833
852
  - Either num_words or num_lines must be specified, but not both
834
853
  - Each chunk is assigned a sequential index in the 'text_chunk' field
@@ -847,28 +866,28 @@ class Scenario(Base, UserDict):
847
866
  def from_dict(cls, d: dict) -> "Scenario":
848
867
  """
849
868
  Creates a Scenario from a dictionary, with special handling for FileStore objects.
850
-
869
+
851
870
  This method creates a Scenario using the provided dictionary. It has special handling
852
871
  for dictionary values that represent serialized FileStore objects, which it will
853
872
  deserialize back into proper FileStore instances.
854
-
873
+
855
874
  Args:
856
875
  d: A dictionary to convert to a Scenario.
857
-
876
+
858
877
  Returns:
859
878
  A new Scenario containing the provided dictionary data.
860
-
879
+
861
880
  Examples:
862
881
  >>> Scenario.from_dict({"food": "wood chips"})
863
882
  Scenario({'food': 'wood chips'})
864
-
883
+
865
884
  >>> # Example with a serialized FileStore
866
- >>> from edsl import FileStore
867
- >>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="}
868
- >>> s = Scenario.from_dict({"document": file_dict})
869
- >>> isinstance(s["document"], FileStore)
885
+ >>> from edsl import FileStore # doctest: +SKIP
886
+ >>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="} # doctest: +SKIP
887
+ >>> s = Scenario.from_dict({"document": file_dict}) # doctest: +SKIP
888
+ >>> isinstance(s["document"], FileStore) # doctest: +SKIP
870
889
  True
871
-
890
+
872
891
  Notes:
873
892
  - Any dictionary values that match the FileStore format will be converted to FileStore objects
874
893
  - The method detects FileStore objects by looking for "base64_string" and "path" keys