edsl 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (328) hide show
  1. edsl/__init__.py +44 -39
  2. edsl/__version__.py +1 -1
  3. edsl/agents/__init__.py +4 -2
  4. edsl/agents/{Agent.py → agent.py} +442 -152
  5. edsl/agents/{AgentList.py → agent_list.py} +220 -162
  6. edsl/agents/descriptors.py +46 -7
  7. edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
  8. edsl/base/__init__.py +75 -0
  9. edsl/base/base_class.py +1303 -0
  10. edsl/base/data_transfer_models.py +114 -0
  11. edsl/base/enums.py +215 -0
  12. edsl/base.py +8 -0
  13. edsl/buckets/__init__.py +25 -0
  14. edsl/buckets/bucket_collection.py +324 -0
  15. edsl/buckets/model_buckets.py +206 -0
  16. edsl/buckets/token_bucket.py +502 -0
  17. edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
  18. edsl/buckets/token_bucket_client.py +509 -0
  19. edsl/caching/__init__.py +20 -0
  20. edsl/caching/cache.py +814 -0
  21. edsl/caching/cache_entry.py +427 -0
  22. edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
  23. edsl/caching/exceptions.py +24 -0
  24. edsl/caching/orm.py +30 -0
  25. edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
  26. edsl/caching/sql_dict.py +441 -0
  27. edsl/config/__init__.py +8 -0
  28. edsl/config/config_class.py +177 -0
  29. edsl/config.py +4 -176
  30. edsl/conversation/Conversation.py +7 -7
  31. edsl/conversation/car_buying.py +4 -4
  32. edsl/conversation/chips.py +6 -6
  33. edsl/coop/__init__.py +25 -2
  34. edsl/coop/coop.py +430 -113
  35. edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
  36. edsl/coop/exceptions.py +62 -0
  37. edsl/coop/price_fetcher.py +126 -0
  38. edsl/coop/utils.py +89 -24
  39. edsl/data_transfer_models.py +5 -72
  40. edsl/dataset/__init__.py +10 -0
  41. edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
  42. edsl/dataset/dataset_operations_mixin.py +1492 -0
  43. edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
  44. edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
  45. edsl/{results → dataset/display}/table_renderers.py +58 -2
  46. edsl/{results → dataset}/file_exports.py +4 -5
  47. edsl/{results → dataset}/smart_objects.py +2 -2
  48. edsl/enums.py +5 -205
  49. edsl/inference_services/__init__.py +5 -0
  50. edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
  51. edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
  52. edsl/inference_services/data_structures.py +3 -2
  53. edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
  54. edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
  55. edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
  56. edsl/inference_services/registry.py +4 -41
  57. edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
  58. edsl/inference_services/services/__init__.py +31 -0
  59. edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
  60. edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
  61. edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
  62. edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
  63. edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
  64. edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
  65. edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
  66. edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
  67. edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
  68. edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
  69. edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +12 -12
  70. edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
  71. edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
  72. edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
  73. edsl/inference_services/write_available.py +1 -2
  74. edsl/instructions/__init__.py +6 -0
  75. edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
  76. edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
  77. edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
  78. edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
  79. edsl/interviews/__init__.py +4 -0
  80. edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
  81. edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
  82. edsl/interviews/interview.py +638 -0
  83. edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
  84. edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
  85. edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
  86. edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
  87. edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
  88. edsl/invigilators/__init__.py +38 -0
  89. edsl/invigilators/invigilator_base.py +477 -0
  90. edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
  91. edsl/invigilators/prompt_constructor.py +476 -0
  92. edsl/{agents → invigilators}/prompt_helpers.py +2 -1
  93. edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
  94. edsl/{agents → invigilators}/question_option_processor.py +96 -21
  95. edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
  96. edsl/jobs/__init__.py +7 -1
  97. edsl/jobs/async_interview_runner.py +99 -35
  98. edsl/jobs/check_survey_scenario_compatibility.py +7 -5
  99. edsl/jobs/data_structures.py +153 -22
  100. edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
  101. edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
  102. edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
  103. edsl/jobs/{Jobs.py → jobs.py} +321 -155
  104. edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
  105. edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +20 -17
  106. edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
  107. edsl/jobs/jobs_pricing_estimation.py +347 -0
  108. edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
  109. edsl/jobs/jobs_runner_asyncio.py +282 -0
  110. edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
  111. edsl/jobs/results_exceptions_handler.py +2 -2
  112. edsl/key_management/__init__.py +28 -0
  113. edsl/key_management/key_lookup.py +161 -0
  114. edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
  115. edsl/key_management/key_lookup_collection.py +82 -0
  116. edsl/key_management/models.py +218 -0
  117. edsl/language_models/__init__.py +7 -2
  118. edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
  119. edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
  120. edsl/language_models/language_model.py +1080 -0
  121. edsl/language_models/model.py +10 -25
  122. edsl/language_models/{ModelList.py → model_list.py} +9 -14
  123. edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
  124. edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
  125. edsl/language_models/repair.py +4 -4
  126. edsl/language_models/utilities.py +4 -4
  127. edsl/notebooks/__init__.py +3 -1
  128. edsl/notebooks/{Notebook.py → notebook.py} +7 -8
  129. edsl/prompts/__init__.py +1 -1
  130. edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
  131. edsl/prompts/{Prompt.py → prompt.py} +101 -95
  132. edsl/questions/HTMLQuestion.py +1 -1
  133. edsl/questions/__init__.py +154 -25
  134. edsl/questions/answer_validator_mixin.py +1 -1
  135. edsl/questions/compose_questions.py +4 -3
  136. edsl/questions/derived/question_likert_five.py +166 -0
  137. edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
  138. edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
  139. edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
  140. edsl/questions/descriptors.py +24 -30
  141. edsl/questions/loop_processor.py +65 -19
  142. edsl/questions/question_base.py +881 -0
  143. edsl/questions/question_base_gen_mixin.py +15 -16
  144. edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
  145. edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
  146. edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
  147. edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
  148. edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
  149. edsl/questions/question_free_text.py +282 -0
  150. edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
  151. edsl/questions/{QuestionList.py → question_list.py} +6 -7
  152. edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
  153. edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
  154. edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
  155. edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
  156. edsl/questions/question_registry.py +10 -16
  157. edsl/questions/register_questions_meta.py +8 -4
  158. edsl/questions/response_validator_abc.py +17 -16
  159. edsl/results/__init__.py +4 -1
  160. edsl/{exceptions/results.py → results/exceptions.py} +1 -1
  161. edsl/results/report.py +197 -0
  162. edsl/results/{Result.py → result.py} +131 -45
  163. edsl/results/{Results.py → results.py} +420 -216
  164. edsl/results/results_selector.py +344 -25
  165. edsl/scenarios/__init__.py +30 -3
  166. edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
  167. edsl/scenarios/directory_scanner.py +156 -13
  168. edsl/scenarios/document_chunker.py +186 -0
  169. edsl/scenarios/exceptions.py +101 -0
  170. edsl/scenarios/file_methods.py +2 -3
  171. edsl/scenarios/file_store.py +755 -0
  172. edsl/scenarios/handlers/__init__.py +14 -14
  173. edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
  174. edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
  175. edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
  176. edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
  177. edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
  178. edsl/scenarios/handlers/latex_file_store.py +5 -0
  179. edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
  180. edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
  181. edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
  182. edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
  183. edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
  184. edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
  185. edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
  186. edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
  187. edsl/scenarios/scenario.py +928 -0
  188. edsl/scenarios/scenario_join.py +18 -5
  189. edsl/scenarios/{ScenarioList.py → scenario_list.py} +424 -106
  190. edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
  191. edsl/scenarios/scenario_selector.py +5 -1
  192. edsl/study/ObjectEntry.py +2 -2
  193. edsl/study/SnapShot.py +5 -5
  194. edsl/study/Study.py +20 -21
  195. edsl/study/__init__.py +6 -4
  196. edsl/surveys/__init__.py +7 -4
  197. edsl/surveys/dag/__init__.py +2 -0
  198. edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
  199. edsl/surveys/{DAG.py → dag/dag.py} +13 -10
  200. edsl/surveys/descriptors.py +1 -1
  201. edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
  202. edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
  203. edsl/surveys/memory/__init__.py +3 -0
  204. edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
  205. edsl/surveys/rules/__init__.py +3 -0
  206. edsl/surveys/{Rule.py → rules/rule.py} +103 -43
  207. edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
  208. edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
  209. edsl/surveys/survey.py +1743 -0
  210. edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
  211. edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
  212. edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
  213. edsl/tasks/__init__.py +32 -0
  214. edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
  215. edsl/tasks/task_creators.py +135 -0
  216. edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
  217. edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
  218. edsl/tasks/task_status_log.py +85 -0
  219. edsl/tokens/__init__.py +2 -0
  220. edsl/tokens/interview_token_usage.py +53 -0
  221. edsl/utilities/PrettyList.py +1 -1
  222. edsl/utilities/SystemInfo.py +25 -22
  223. edsl/utilities/__init__.py +29 -21
  224. edsl/utilities/gcp_bucket/__init__.py +2 -0
  225. edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
  226. edsl/utilities/interface.py +44 -536
  227. edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
  228. edsl/utilities/repair_functions.py +1 -1
  229. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/METADATA +3 -2
  230. edsl-0.1.48.dist-info/RECORD +347 -0
  231. edsl/Base.py +0 -426
  232. edsl/BaseDiff.py +0 -260
  233. edsl/agents/InvigilatorBase.py +0 -260
  234. edsl/agents/PromptConstructor.py +0 -318
  235. edsl/auto/AutoStudy.py +0 -130
  236. edsl/auto/StageBase.py +0 -243
  237. edsl/auto/StageGenerateSurvey.py +0 -178
  238. edsl/auto/StageLabelQuestions.py +0 -125
  239. edsl/auto/StagePersona.py +0 -61
  240. edsl/auto/StagePersonaDimensionValueRanges.py +0 -88
  241. edsl/auto/StagePersonaDimensionValues.py +0 -74
  242. edsl/auto/StagePersonaDimensions.py +0 -69
  243. edsl/auto/StageQuestions.py +0 -74
  244. edsl/auto/SurveyCreatorPipeline.py +0 -21
  245. edsl/auto/utilities.py +0 -218
  246. edsl/base/Base.py +0 -279
  247. edsl/coop/PriceFetcher.py +0 -54
  248. edsl/data/Cache.py +0 -580
  249. edsl/data/CacheEntry.py +0 -230
  250. edsl/data/SQLiteDict.py +0 -292
  251. edsl/data/__init__.py +0 -5
  252. edsl/data/orm.py +0 -10
  253. edsl/exceptions/cache.py +0 -5
  254. edsl/exceptions/coop.py +0 -14
  255. edsl/exceptions/data.py +0 -14
  256. edsl/exceptions/scenarios.py +0 -29
  257. edsl/jobs/Answers.py +0 -43
  258. edsl/jobs/JobsPrompts.py +0 -354
  259. edsl/jobs/buckets/BucketCollection.py +0 -134
  260. edsl/jobs/buckets/ModelBuckets.py +0 -65
  261. edsl/jobs/buckets/TokenBucket.py +0 -283
  262. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  263. edsl/jobs/interviews/Interview.py +0 -395
  264. edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
  265. edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
  266. edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
  267. edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
  268. edsl/jobs/tasks/TaskCreators.py +0 -64
  269. edsl/jobs/tasks/TaskStatusLog.py +0 -23
  270. edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
  271. edsl/language_models/LanguageModel.py +0 -635
  272. edsl/language_models/ServiceDataSources.py +0 -0
  273. edsl/language_models/key_management/KeyLookup.py +0 -63
  274. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  275. edsl/language_models/key_management/models.py +0 -137
  276. edsl/questions/QuestionBase.py +0 -539
  277. edsl/questions/QuestionFreeText.py +0 -130
  278. edsl/questions/derived/QuestionLikertFive.py +0 -76
  279. edsl/results/DatasetExportMixin.py +0 -911
  280. edsl/results/ResultsExportMixin.py +0 -45
  281. edsl/results/TextEditor.py +0 -50
  282. edsl/results/results_fetch_mixin.py +0 -33
  283. edsl/results/results_tools_mixin.py +0 -98
  284. edsl/scenarios/DocumentChunker.py +0 -104
  285. edsl/scenarios/FileStore.py +0 -564
  286. edsl/scenarios/Scenario.py +0 -548
  287. edsl/scenarios/ScenarioHtmlMixin.py +0 -65
  288. edsl/scenarios/ScenarioListExportMixin.py +0 -45
  289. edsl/scenarios/handlers/latex.py +0 -5
  290. edsl/shared.py +0 -1
  291. edsl/surveys/Survey.py +0 -1306
  292. edsl/surveys/SurveyQualtricsImport.py +0 -284
  293. edsl/surveys/SurveyToApp.py +0 -141
  294. edsl/surveys/instructions/__init__.py +0 -0
  295. edsl/tools/__init__.py +0 -1
  296. edsl/tools/clusters.py +0 -192
  297. edsl/tools/embeddings.py +0 -27
  298. edsl/tools/embeddings_plotting.py +0 -118
  299. edsl/tools/plotting.py +0 -112
  300. edsl/tools/summarize.py +0 -18
  301. edsl/utilities/data/Registry.py +0 -6
  302. edsl/utilities/data/__init__.py +0 -1
  303. edsl/utilities/data/scooter_results.json +0 -1
  304. edsl-0.1.46.dist-info/RECORD +0 -366
  305. /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
  306. /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
  307. /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
  308. /edsl/{results → dataset/display}/table_data_class.py +0 -0
  309. /edsl/{results → dataset/display}/table_display.css +0 -0
  310. /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
  311. /edsl/{results → dataset}/tree_explore.py +0 -0
  312. /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
  313. /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
  314. /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
  315. /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
  316. /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
  317. /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
  318. /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
  319. /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
  320. /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
  321. /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
  322. /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
  323. /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
  324. /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
  325. /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
  326. /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
  327. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
  328. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
@@ -0,0 +1,928 @@
1
+ """
2
+ A Scenario is a dictionary-like object that stores key-value pairs for parameterizing questions.
3
+
4
+ Scenarios are a fundamental concept in EDSL, providing a mechanism to parameterize
5
+ questions with dynamic values. Each Scenario contains key-value pairs that can be
6
+ referenced within question templates using Jinja syntax. This allows for creating
7
+ questions that vary based on the specific scenario being presented.
8
+
9
+ Key features include:
10
+ - Dictionary-like behavior (inherits from UserDict)
11
+ - Support for combination operations (addition, multiplication)
12
+ - Conversion to/from various formats (dict, dataset)
13
+ - Methods for file and data source integration
14
+
15
+ Scenarios can be created from various sources including files, URLs, PDFs, images,
16
+ and HTML content. They serve as the primary mechanism for providing context or variable
17
+ information to questions in surveys.
18
+ """
19
+
20
+ from __future__ import annotations
21
+ import copy
22
+ import os
23
+ import json
24
+ from collections import UserDict
25
+ from typing import Union, List, Optional, TYPE_CHECKING, Collection
26
+ from uuid import uuid4
27
+
28
+ from ..base import Base
29
+ from ..utilities import remove_edsl_version
30
+ from .exceptions import ScenarioError
31
+
32
+ if TYPE_CHECKING:
33
+ from .scenario_list import ScenarioList
34
+ from ..dataset import Dataset
35
+
36
+
37
+
38
+ class Scenario(Base, UserDict):
39
+ """
40
+ A dictionary-like object that stores key-value pairs for parameterizing questions.
41
+
42
+ A Scenario inherits from both the EDSL Base class and Python's UserDict, allowing
43
+ it to function as a dictionary while providing additional functionality. Scenarios
44
+ are used to parameterize questions by providing variable data that can be referenced
45
+ within question templates using Jinja syntax.
46
+
47
+ Scenarios can be created directly with dictionary data or constructed from various
48
+ sources using class methods (from_file, from_url, from_pdf, etc.). They support
49
+ operations like addition (combining scenarios) and multiplication (creating cross
50
+ products with other scenarios or scenario lists).
51
+
52
+ Attributes:
53
+ data (dict): The underlying dictionary data.
54
+ name (str, optional): A name for the scenario.
55
+
56
+ Examples:
57
+ Create a simple scenario:
58
+ >>> s = Scenario({"product": "coffee", "price": 4.99})
59
+
60
+ Combine scenarios:
61
+ >>> s1 = Scenario({"product": "coffee"})
62
+ >>> s2 = Scenario({"price": 4.99})
63
+ >>> s3 = s1 + s2
64
+ >>> s3
65
+ Scenario({'product': 'coffee', 'price': 4.99})
66
+
67
+ Create a scenario from a file:
68
+ >>> import tempfile
69
+ >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
70
+ ... _ = f.write("Hello World")
71
+ ... data_path = f.name
72
+ >>> s = Scenario.from_file(data_path, "document")
73
+ >>> import os
74
+ >>> os.unlink(data_path) # Clean up temp file
75
+ """
76
+
77
+ __documentation__ = "https://docs.expectedparrot.com/en/latest/scenarios.html"
78
+
79
+ def __init__(self, data: Optional[dict] = None, name: Optional[str] = None):
80
+ """
81
+ Initialize a new Scenario.
82
+
83
+ Args:
84
+ data: A dictionary of key-value pairs for parameterizing questions.
85
+ Any dictionary-like object that can be converted to a dict is accepted.
86
+ name: An optional name for the scenario to aid in identification.
87
+
88
+ Raises:
89
+ ScenarioError: If the data cannot be converted to a dictionary.
90
+
91
+ Examples:
92
+ >>> s = Scenario({"product": "coffee", "price": 4.99})
93
+ >>> s = Scenario({"question": "What is your favorite color?"}, name="color_question")
94
+ """
95
+ if not isinstance(data, dict) and data is not None:
96
+ try:
97
+ data = dict(data)
98
+ except Exception as e:
99
+ raise ScenarioError(
100
+ f"You must pass in a dictionary to initialize a Scenario. You passed in {data}",
101
+ "Exception message:" + str(e),
102
+ )
103
+
104
+ super().__init__()
105
+ self.data = data if data is not None else {}
106
+ self.name = name
107
+
108
def __mul__(self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]) -> "ScenarioList":
    """Return the cross product of this Scenario with a Scenario or ScenarioList.

    Args:
        scenario_list_or_scenario: The right-hand operand; must be a Scenario
            or a ScenarioList.

    Returns:
        A ScenarioList containing the cross product.

    Raises:
        TypeError: If the operand is neither a Scenario nor a ScenarioList.

    Example:
        >>> s1 = Scenario({'a': 1})
        >>> s2 = Scenario({'b': 2})
        >>> s1 * s2
        ScenarioList([Scenario({'a': 1, 'b': 2})])

        >>> from edsl.scenarios import ScenarioList
        >>> sl = ScenarioList([Scenario({'b': 2}), Scenario({'b': 3})])
        >>> new_s = s1 * sl
        >>> new_s == ScenarioList([Scenario({'a': 1, 'b': 2}), Scenario({'a': 1, 'b': 3})])
        True
    """
    # Imported here rather than at module level (the module only imports
    # ScenarioList under TYPE_CHECKING).
    from .scenario_list import ScenarioList

    other = scenario_list_or_scenario
    if isinstance(other, ScenarioList):
        # Delegate to the list's own multiplication.
        return other * self
    if isinstance(other, Scenario):
        # Wrap self in a one-element list and let ScenarioList do the work.
        return ScenarioList([self]) * other
    raise TypeError(f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}")
136
+
137
def replicate(self, n: int) -> "ScenarioList":
    """Return a ScenarioList holding ``n`` independent deep copies of this scenario.

    :param n: The number of times to replicate the scenario.

    Example:
        >>> s = Scenario({"food": "wood chips"})
        >>> s.replicate(2)
        ScenarioList([Scenario({'food': 'wood chips'}), Scenario({'food': 'wood chips'})])
    """
    from .scenario_list import ScenarioList

    copies = []
    for _ in range(n):
        # Each entry is a separate deep copy, so the replicas share no state.
        copies.append(copy.deepcopy(self))
    return ScenarioList(copies)
150
+
151
+ @property
152
+ def has_jinja_braces(self) -> bool:
153
+ """Return whether the scenario has jinja braces. This matters for rendering.
154
+
155
+ >>> s = Scenario({"food": "I love {{wood chips}}"})
156
+ >>> s.has_jinja_braces
157
+ True
158
+ """
159
+ for _, value in self.items():
160
+ if isinstance(value, str):
161
+ if "{{" in value and "}}" in value:
162
+ return True
163
+ return False
164
+
165
+ def _convert_jinja_braces(
166
+ self, replacement_left: str = "<<", replacement_right: str = ">>"
167
+ ) -> Scenario:
168
+ """Convert Jinja braces to some other character.
169
+
170
+ >>> s = Scenario({"food": "I love {{wood chips}}"})
171
+ >>> s._convert_jinja_braces()
172
+ Scenario({'food': 'I love <<wood chips>>'})
173
+
174
+ """
175
+ new_scenario = Scenario()
176
+ for key, value in self.items():
177
+ if isinstance(value, str):
178
+ new_scenario[key] = value.replace("{{", replacement_left).replace(
179
+ "}}", replacement_right
180
+ )
181
+ else:
182
+ new_scenario[key] = value
183
+ return new_scenario
184
+
185
+ def __add__(self, other_scenario: Scenario) -> Scenario:
186
+ """Combine two scenarios by taking the union of their keys
187
+
188
+ If the other scenario is None, then just return self.
189
+
190
+ :param other_scenario: The other scenario to combine with.
191
+
192
+ Example:
193
+
194
+ >>> s1 = Scenario({"price": 100, "quantity": 2})
195
+ >>> s2 = Scenario({"color": "red"})
196
+ >>> s1 + s2
197
+ Scenario({'price': 100, 'quantity': 2, 'color': 'red'})
198
+ >>> (s1 + s2).__class__.__name__
199
+ 'Scenario'
200
+ """
201
+ if other_scenario is None:
202
+ return self
203
+ else:
204
+ data1 = copy.deepcopy(self.data)
205
+ data2 = copy.deepcopy(other_scenario.data)
206
+ s = Scenario(data1 | data2)
207
+ return s
208
+
209
+ def rename(
210
+ self,
211
+ old_name_or_replacement_dict: Union[str, dict[str, str]],
212
+ new_name: Optional[str] = None,
213
+ ) -> Scenario:
214
+ """Rename the keys of a scenario.
215
+
216
+ :param old_name_or_replacement_dict: A dictionary of old keys to new keys *OR* a string of the old key.
217
+ :param new_name: The new name of the key.
218
+
219
+ Example:
220
+
221
+ >>> s = Scenario({"food": "wood chips"})
222
+ >>> s.rename({"food": "food_preference"})
223
+ Scenario({'food_preference': 'wood chips'})
224
+
225
+ >>> s = Scenario({"food": "wood chips"})
226
+ >>> s.rename("food", "snack")
227
+ Scenario({'snack': 'wood chips'})
228
+ """
229
+ if isinstance(old_name_or_replacement_dict, str) and new_name is not None:
230
+ replacement_dict = {old_name_or_replacement_dict: new_name}
231
+ else:
232
+ replacement_dict = old_name_or_replacement_dict
233
+
234
+ new_scenario = Scenario()
235
+ for key, value in self.items():
236
+ if key in replacement_dict:
237
+ new_scenario[replacement_dict[key]] = value
238
+ else:
239
+ new_scenario[key] = value
240
+ return new_scenario
241
+
242
+ def new_column_names(self, new_names: List[str]) -> Scenario:
243
+ """Rename the keys of a scenario.
244
+
245
+ >>> s = Scenario({"food": "wood chips"})
246
+ >>> s.new_column_names(["food_preference"])
247
+ Scenario({'food_preference': 'wood chips'})
248
+ """
249
+ try:
250
+ assert len(new_names) == len(self.keys())
251
+ except AssertionError:
252
+ print("The number of new names must match the number of keys.")
253
+
254
+ new_scenario = Scenario()
255
+ for new_names, value in zip(new_names, self.values()):
256
+ new_scenario[new_names] = value
257
+ return new_scenario
258
+
259
def table(self, tablefmt: str = "grid") -> str:
    """Display a scenario as a table.

    Converts the scenario with ``to_dataset()`` and returns whatever that
    dataset's ``table`` method produces for the given ``tablefmt``.
    """
    as_dataset = self.to_dataset()
    return as_dataset.table(tablefmt=tablefmt)
262
+
263
+
264
def to_dict(self, add_edsl_version: bool = True) -> dict:
    """Serialize the scenario to a plain dictionary.

    FileStore values are replaced by their own ``to_dict`` output; all other
    values are carried over as-is. When ``add_edsl_version`` is true, the
    keys ``edsl_version`` and ``edsl_class_name`` are added.

    Example:

    >>> s = Scenario({"food": "wood chips"})
    >>> s.to_dict()
    {'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}

    >>> s.to_dict(add_edsl_version = False)
    {'food': 'wood chips'}

    """
    from edsl.scenarios import FileStore

    serialized = {
        key: (
            value.to_dict(add_edsl_version=add_edsl_version)
            if isinstance(value, FileStore)
            else value
        )
        for key, value in self.data.items()
    }
    if not add_edsl_version:
        return serialized

    from edsl import __version__

    serialized["edsl_version"] = __version__
    serialized["edsl_class_name"] = "Scenario"
    return serialized
290
+
291
def __hash__(self) -> int:
    """Hash the scenario from its dictionary form, excluding EDSL version keys.

    Example:

    >>> s = Scenario({"food": "wood chips"})
    >>> hash(s)
    1153210385458344214
    """
    from edsl.utilities.utilities import dict_hash

    payload = self.to_dict(add_edsl_version=False)
    return dict_hash(payload)
303
+
304
+ def __repr__(self):
305
+ return "Scenario(" + repr(self.data) + ")"
306
+
307
def to_dataset(self) -> "Dataset":
    """Build a two-column Dataset: one ``key`` column and one ``value`` column.

    >>> s = Scenario({"food": "wood chips"})
    >>> s.to_dataset()
    Dataset([{'key': ['food']}, {'value': ['wood chips']}])
    """
    from ..dataset import Dataset

    key_column = {"key": list(self.keys())}
    value_column = {"value": list(self.values())}
    return Dataset([key_column, value_column])
319
+
320
+ def select(self, list_of_keys: Collection[str]) -> "Scenario":
321
+ """Select a subset of keys from a scenario.
322
+
323
+ :param list_of_keys: The keys to select.
324
+
325
+ Example:
326
+
327
+ >>> s = Scenario({"food": "wood chips", "drink": "water"})
328
+ >>> s.select(["food"])
329
+ Scenario({'food': 'wood chips'})
330
+ """
331
+ new_scenario = Scenario()
332
+ for key in list_of_keys:
333
+ new_scenario[key] = self[key]
334
+ return new_scenario
335
+
336
+ def drop(self, list_of_keys: Collection[str]) -> "Scenario":
337
+ """Drop a subset of keys from a scenario.
338
+
339
+ :param list_of_keys: The keys to drop.
340
+
341
+ Example:
342
+
343
+ >>> s = Scenario({"food": "wood chips", "drink": "water"})
344
+ >>> s.drop(["food"])
345
+ Scenario({'drink': 'water'})
346
+ """
347
+ new_scenario = Scenario()
348
+ for key in self.keys():
349
+ if key not in list_of_keys:
350
+ new_scenario[key] = self[key]
351
+ return new_scenario
352
+
353
+ def keep(self, list_of_keys: List[str]) -> "Scenario":
354
+ """Keep a subset of keys from a scenario.
355
+
356
+ :param list_of_keys: The keys to keep.
357
+
358
+ Example:
359
+
360
+ >>> s = Scenario({"food": "wood chips", "drink": "water"})
361
+ >>> s.keep(["food"])
362
+ Scenario({'food': 'wood chips'})
363
+ """
364
+
365
+ return self.select(list_of_keys)
366
+
367
@classmethod
def from_url(cls, url: str, field_name: Optional[str] = "text", testing: bool = False) -> "Scenario":
    """
    Creates a Scenario from the content of a URL.

    This method fetches content from a web URL and creates a Scenario containing the URL
    and the extracted text. When available, BeautifulSoup is used for better HTML parsing
    and text extraction, otherwise a basic requests approach is used.

    Args:
        url: The URL to fetch content from.
        field_name: The key name to use for storing the extracted text in the Scenario.
            Defaults to "text"; a falsy value (None or "") is also treated as "text".
        testing: If True, uses a simplified requests method instead of BeautifulSoup.
            This is primarily for testing purposes.

    Returns:
        A Scenario containing the URL and extracted text.

    Raises:
        requests.exceptions.RequestException: If the URL cannot be accessed, including
            when the server does not respond within the 10 second timeout.

    Examples:
        >>> s = Scenario.from_url("https://example.com", testing=True)
        >>> "url" in s and "text" in s
        True

        >>> s = Scenario.from_url("https://example.com", field_name="content", testing=True)
        >>> "url" in s and "content" in s
        True

    Notes:
        - The method attempts to use BeautifulSoup and fake_useragent for better
          HTML parsing and to mimic a real browser.
        - If these packages are not available, it falls back to basic requests.
        - When using BeautifulSoup, it extracts text from paragraph and heading tags.
        - The HTTP status code is not checked, so the body of an error page is
          returned as text like any other response.
    """
    import requests

    # requests has no default timeout; without one a stalled server blocks forever.
    # 10 seconds matches the value fetch_html passes to session.get.
    timeout_seconds = 10

    # Avoid storing the text under a None/empty key (from_html applies the same default).
    text_key = field_name or "text"

    if testing:
        # Use simple requests method for testing
        response = requests.get(url, timeout=timeout_seconds)
        text = response.text
    else:
        try:
            from bs4 import BeautifulSoup
            from fake_useragent import UserAgent

            # Configure request headers to appear more like a regular browser
            ua = UserAgent()
            headers = {
                'User-Agent': ua.random,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            response = requests.get(url, headers=headers, timeout=timeout_seconds)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get text content while preserving some structure
            text = ' '.join(
                tag.get_text(strip=True)
                for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            )

        except ImportError:
            # Fallback to basic requests if BeautifulSoup/fake_useragent not available
            print("BeautifulSoup/fake_useragent not available. Falling back to basic requests.")
            response = requests.get(url, timeout=timeout_seconds)
            text = response.text

    return cls({"url": url, text_key: text})
436
+
437
@classmethod
def from_file(cls, file_path: str, field_name: str) -> "Scenario":
    """
    Wraps a file in a FileStore and returns it as a single-entry Scenario.

    A FileStore is constructed from ``file_path`` and stored under
    ``field_name``; the resulting Scenario has exactly that one key.

    Args:
        file_path: Path to the file to be incorporated into the Scenario.
        field_name: Key name to use for storing the FileStore in the Scenario.

    Returns:
        A Scenario containing a FileStore object linked to the specified file.

    Raises:
        FileNotFoundError: If the specified file does not exist.

    Examples:
        >>> import tempfile
        >>> with tempfile.NamedTemporaryFile(suffix=".txt", mode="w") as f:
        ...     _ = f.write("This is a test.")
        ...     _ = f.flush()
        ...     s = Scenario.from_file(f.name, "file")
        >>> s
        Scenario({'file': FileStore(path='...', ...)})

    Notes:
        - The FileStore object handles various file formats differently
        - FileStore provides methods to access file content, extract text,
          and manage file operations appropriate to the file type
    """
    from edsl.scenarios import FileStore

    stored_file = FileStore(file_path)
    payload = {field_name: stored_file}
    return cls(payload)
474
+
475
@classmethod
def from_image(
    cls, image_path: str, image_name: Optional[str] = None
) -> "Scenario":
    """
    Builds a Scenario that holds an image file, by delegating to ``from_file``.

    The path is checked for existence first. When no key name is given, the
    key is the file's base name up to (not including) its first ".".

    Args:
        image_path: Path to the image file to be incorporated into the Scenario.
        image_name: Key name to use for storing the FileStore in the Scenario.
            If not provided, it is derived from the file name as described above
            (so "a.b.png" yields "a").

    Returns:
        Whatever ``cls.from_file(image_path, image_name)`` returns: a Scenario
        containing a FileStore for the image.

    Raises:
        FileNotFoundError: If the specified image file does not exist.

    Examples:
        >>> import os
        >>> # Assuming an image file exists
        >>> if os.path.exists("image.jpg"):
        ...     s = Scenario.from_image("image.jpg")
        ...     s_named = Scenario.from_image("image.jpg", "picture")
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    key = image_name
    if key is None:
        # Everything before the first dot of the base name.
        key, _, _ = os.path.basename(image_path).partition(".")

    return cls.from_file(image_path, key)
517
+
518
@classmethod
def from_pdf(cls, pdf_path: str) -> "Scenario":
    """
    Creates a Scenario containing text extracted from a PDF file.

    A PdfExtractor is built for ``pdf_path`` and the dictionary returned by
    its ``get_pdf_dict()`` becomes the Scenario's content.

    Args:
        pdf_path: Path to the PDF file to extract content from.

    Returns:
        A Scenario built from the extractor's dictionary. Note that the result
        is always a plain Scenario, even when called on a subclass.

    Raises:
        ImportError: If importing or running the extractor raises ImportError.
            The original error is attached as ``__cause__``, and the message
            explains that PyMuPDF is required.

    Examples:
        >>> import os
        >>> # Assuming a PDF file exists
        >>> if os.path.exists("document.pdf"):
        ...     s = Scenario.from_pdf("document.pdf")
    """
    try:
        from edsl.scenarios.PdfExtractor import PdfExtractor

        extractor = PdfExtractor(pdf_path)
        return Scenario(extractor.get_pdf_dict())
    except ImportError as e:
        # Chain explicitly so the traceback shows the missing module as the
        # direct cause rather than as an incidental error during handling.
        raise ImportError(
            f"Could not extract text from PDF: {str(e)}. "
            "PDF extraction requires the PyMuPDF library. "
            "Install it with: pip install pymupdf"
        ) from e
558
+
559
@classmethod
def from_html(cls, url: str, field_name: Optional[str] = None) -> "Scenario":
    """
    Fetches a page and returns a Scenario with its URL, raw HTML and extracted text.

    The HTML comes from ``cls.fetch_html(url)`` and the text from
    ``cls.extract_text(html)``. Unlike from_url, the raw HTML is kept in the
    result under the "html" key.

    Args:
        url: URL to fetch HTML content from.
        field_name: Key name to use for the extracted text in the Scenario.
            If not provided (or empty), defaults to "text".

    Returns:
        A Scenario containing the URL, raw HTML, and extracted text.

    Examples:
        >>> s = Scenario.from_html("https://example.com")
        >>> all(key in s for key in ["url", "html", "text"])
        True

        >>> s = Scenario.from_html("https://example.com", field_name="content")
        >>> all(key in s for key in ["url", "html", "content"])
        True
    """
    raw_html = cls.fetch_html(url)
    extracted = cls.extract_text(raw_html)
    text_key = field_name if field_name else "text"
    return cls({"url": url, "html": raw_html, text_key: extracted})
599
+
600
@staticmethod
def fetch_html(url: str) -> Optional[str]:
    """
    Fetches HTML content from a URL with retries, returning None on failure.

    A requests session is configured to retry up to 5 times (backoff factor
    0.1) on HTTP 500, 502, 503 and 504 responses, and sends a fixed
    browser-like User-Agent header. The request uses a 10 second timeout.
    The session is closed before the function returns.

    Args:
        url: The URL to fetch HTML content from.

    Returns:
        The response body as a string, or None if the request failed or the
        server answered with an HTTP error status. Request errors are caught,
        printed, and reported as None; they are not propagated to the caller.
    """
    import requests
    from requests.adapters import HTTPAdapter

    # Import Retry from urllib3 directly; requests.packages is only a
    # backwards-compatibility alias for the same module.
    from urllib3.util.retry import Retry

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    retries = Retry(
        total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]
    )

    # The context manager closes the session (and its pooled connections)
    # on every exit path.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retries))
        session.mount("https://", HTTPAdapter(max_retries=retries))

        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None
642
+
643
@staticmethod
def extract_text(html: Optional[str]) -> str:
    """
    Extracts readable text from HTML content using BeautifulSoup.

    Script and style elements are removed, the remaining text is split into
    lines and then into phrases, and the non-empty stripped phrases are
    joined with newlines.

    Args:
        html: The HTML content to extract text from.

    Returns:
        The extracted text. Returns an empty string if the input is None or
        if anything in the extraction raises (including BeautifulSoup not
        being importable); in that case the error is printed.
    """
    if html is None:
        return ""

    try:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        # Drop elements whose content is not meant to be read.
        for tag in soup(["script", "style"]):
            tag.extract()

        raw_text = soup.get_text()

        # Normalize whitespace: strip each line, split it into phrases,
        # strip those, and keep only the non-empty ones.
        # NOTE(review): the separator is a single space as rendered here; if the
        # rendering collapsed whitespace the original may differ - confirm.
        pieces = []
        for raw_line in raw_text.splitlines():
            for phrase in raw_line.strip().split(" "):
                cleaned = phrase.strip()
                if cleaned:
                    pieces.append(cleaned)

        return '\n'.join(pieces)
    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""
680
+
681
+
682
@classmethod
def from_pdf_to_image(cls, pdf_path: str, image_format: str = "jpeg") -> "Scenario":
    """
    Converts each page of a PDF into an image and creates a Scenario containing them.

    Each page returned by pdf2image's ``convert_from_path`` is saved into a
    temporary directory as ``page_<i>.<image_format>`` and wrapped in a
    FileStore. The Scenario holds the original path under "filepath" and
    one FileStore per page.

    Args:
        pdf_path: Path to the PDF file to convert to images.
        image_format: Format of the output images (default is 'jpeg').
            It is used as the file extension and, upper-cased, as the
            format name passed to ``image.save``.

    Returns:
        A Scenario containing the original PDF file path and FileStore objects
        for each page image, with keys like "page_0", "page_1", etc.

    Raises:
        ImportError: If pdf2image is not installed.

    Examples:
        >>> import os
        >>> # Assuming a PDF file exists
        >>> if os.path.exists("document.pdf"):
        ...     s = Scenario.from_pdf_to_image("document.pdf")
        ...     s_png = Scenario.from_pdf_to_image("document.pdf", "png")

    Notes:
        - Requires the pdf2image library which depends on poppler
        - Images are written to a temporary directory that is removed when the
          method returns; the FileStore objects are created before that happens
    """
    import tempfile
    from pdf2image import convert_from_path

    # Imported once up front rather than on every loop iteration.
    from edsl.scenarios import FileStore

    with tempfile.TemporaryDirectory() as output_folder:
        # Convert PDF to images
        images = convert_from_path(pdf_path)

        scenario_dict = {"filepath": pdf_path}

        # Save each page as an image and wrap it while the file still exists.
        for i, image in enumerate(images):
            image_path = os.path.join(output_folder, f"page_{i}.{image_format}")
            image.save(image_path, image_format.upper())
            scenario_dict[f"page_{i}"] = FileStore(image_path)

    # Build the result directly; no intermediate Scenario is needed.
    return cls(scenario_dict)
739
+
740
@classmethod
def from_docx(cls, docx_path: str) -> "Scenario":
    """
    Creates a Scenario from the contents of a Microsoft Word document.

    The work is delegated to DocxScenario: its ``get_scenario_dict()`` result
    for ``docx_path`` becomes the Scenario's content. The result is always a
    plain Scenario, even when called on a subclass.

    Args:
        docx_path: Path to the DOCX file to extract content from.

    Returns:
        A Scenario containing the file path and extracted text from the DOCX file.

    Raises:
        FileNotFoundError: If the specified DOCX file does not exist.
        ImportError: If the python-docx library is not installed.

    Examples:
        >>> from docx import Document
        >>> doc = Document()
        >>> _ = doc.add_heading("EDSL Survey")
        >>> _ = doc.add_paragraph("This is a test.")
        >>> doc.save("test.docx")
        >>> s = Scenario.from_docx("test.docx")
        >>> s
        Scenario({'file_path': 'test.docx', 'text': 'EDSL Survey\\nThis is a test.'})
        >>> import os; os.remove("test.docx")
    """
    from edsl.scenarios.DocxScenario import DocxScenario

    docx_reader = DocxScenario(docx_path)
    extracted = docx_reader.get_scenario_dict()
    return Scenario(extracted)
778
+
779
def chunk(
    self,
    field: str,
    num_words: Optional[int] = None,
    num_lines: Optional[int] = None,
    include_original: bool = False,
    hash_original: bool = False,
) -> "ScenarioList":
    """
    Splits a text field into fixed-size chunks and returns them as a ScenarioList.

    The splitting itself is done by DocumentChunker; this method wraps the
    scenario in one and forwards all arguments in order. Chunks are sized by
    word count or by line count.

    Args:
        field: The key name of the field in the Scenario to split.
        num_words: The number of words to include in each chunk. Mutually exclusive
            with num_lines.
        num_lines: The number of lines to include in each chunk. Mutually exclusive
            with num_words.
        include_original: If True, includes the original complete text in each chunk
            with a "_original" suffix.
        hash_original: If True and include_original is True, stores a hash of the
            original text instead of the full text.

    Returns:
        A ScenarioList containing multiple Scenarios, each with a chunk of the
        original text. Each Scenario includes the chunk text, chunk index, character
        count, and word count.

    Raises:
        ValueError: If neither num_words nor num_lines is specified, or if both are.
        KeyError: If the specified field doesn't exist in the Scenario.

    Examples:
        Split by lines (1 line per chunk):
        >>> s = Scenario({"text": "This is a test.\\nThis is a test.\\n\\nThis is a test."})
        >>> s.chunk("text", num_lines=1)
        ScenarioList([Scenario({'text': 'This is a test.', 'text_chunk': 0, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': 'This is a test.', 'text_chunk': 1, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': '', 'text_chunk': 2, 'text_char_count': 0, 'text_word_count': 0}), Scenario({'text': 'This is a test.', 'text_chunk': 3, 'text_char_count': 15, 'text_word_count': 4})])

        Split by words (2 words per chunk):
        >>> s.chunk("text", num_words=2)
        ScenarioList([Scenario({'text': 'This is', 'text_chunk': 0, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 1, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 2, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 3, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 4, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 5, 'text_char_count': 7, 'text_word_count': 2})])

        Include original text in each chunk:
        >>> s = Scenario({"text": "Hello World"})
        >>> s.chunk("text", num_words=1, include_original=True)
        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'})])

        Use a hash of the original text:
        >>> s.chunk("text", num_words=1, include_original=True, hash_original=True)
        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])

    Notes:
        - Either num_words or num_lines must be specified, but not both
        - Each chunk is assigned a sequential index in the 'text_chunk' field
        - Character and word counts for each chunk are included
        - When include_original is True, the original text is preserved in each chunk
        - The hash_original option is useful to save space while maintaining traceability
    """
    from .document_chunker import DocumentChunker

    chunker = DocumentChunker(self)
    pieces = chunker.chunk(
        field, num_words, num_lines, include_original, hash_original
    )
    return pieces
846
+
847
@classmethod
@remove_edsl_version
def from_dict(cls, d: dict) -> "Scenario":
    """
    Creates a Scenario from a dictionary, with special handling for FileStore objects.

    Values that look like serialized FileStore objects (dictionaries with both
    a "base64_string" and a "path" key), as well as values that already are
    FileStore instances, are passed through ``FileStore.from_dict``. All other
    values are used unchanged. The dictionary passed in is not modified; the
    Scenario is built from a new dictionary.

    Args:
        d: A dictionary to convert to a Scenario.

    Returns:
        A new Scenario containing the provided dictionary data.

    Examples:
        >>> Scenario.from_dict({"food": "wood chips"})
        Scenario({'food': 'wood chips'})

        >>> # Example with a serialized FileStore
        >>> from edsl import FileStore
        >>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="}
        >>> s = Scenario.from_dict({"document": file_dict})
        >>> isinstance(s["document"], FileStore)
        True

    Notes:
        - The method detects FileStore objects by looking for "base64_string" and "path" keys
        - EDSL version information is automatically removed by the @remove_edsl_version decorator
        - This method is commonly used when deserializing scenarios from JSON or other formats
    """
    from edsl.scenarios import FileStore

    def _revive(value):
        """Return a FileStore for FileStore-like values, else the value itself."""
        # TODO: we should check this better if its a FileStore + add remote security check against path traversal
        looks_serialized = (
            isinstance(value, dict) and "base64_string" in value and "path" in value
        )
        if looks_serialized or isinstance(value, FileStore):
            return FileStore.from_dict(value)
        return value

    # Build a fresh mapping instead of overwriting entries of the dict we were handed.
    return cls({key: _revive(value) for key, value in d.items()})
889
+
890
+ def _table(self) -> tuple[dict, list]:
891
+ """Prepare generic table data.
892
+ >>> s = Scenario({"food": "wood chips"})
893
+ >>> s._table()
894
+ ([{'Attribute': 'data', 'Value': "{'food': 'wood chips'}"}, {'Attribute': 'name', 'Value': 'None'}], ['Attribute', 'Value'])
895
+ """
896
+ table_data = []
897
+ for attr_name, attr_value in self.__dict__.items():
898
+ table_data.append({"Attribute": attr_name, "Value": repr(attr_value)})
899
+ column_names = ["Attribute", "Value"]
900
+ return table_data, column_names
901
+
902
@classmethod
def example(cls, randomize: bool = False) -> Scenario:
    """
    Returns an example Scenario instance with a single "persona" key.

    :param randomize: If True, a freshly generated UUID string is appended to
        the persona text, so that each call yields a different value.
    """
    suffix = str(uuid4()) if randomize else ""
    persona = f"A reseacher studying whether LLMs can be used to generate surveys.{suffix}"
    return cls({"persona": persona})
915
+
916
def code(self) -> List[str]:
    """Return the lines of Python code that recreate this scenario.

    The first line imports Scenario from the ``edsl.scenarios`` package (the
    package the rest of this module imports from); the second constructs a
    Scenario from the string form of ``self.data``.

    :return: A two-element list of source lines.
    """
    return [
        "from edsl.scenarios import Scenario",
        f"s = Scenario({self.data})",
    ]
923
+
924
+
925
if __name__ == "__main__":
    # When executed as a script, run this module's doctests. ELLIPSIS lets
    # "..." in expected output match any text.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)