edsl 0.1.47__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +44 -39
- edsl/__version__.py +1 -1
- edsl/agents/__init__.py +4 -2
- edsl/agents/{Agent.py → agent.py} +442 -152
- edsl/agents/{AgentList.py → agent_list.py} +220 -162
- edsl/agents/descriptors.py +46 -7
- edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
- edsl/base/__init__.py +75 -0
- edsl/base/base_class.py +1303 -0
- edsl/base/data_transfer_models.py +114 -0
- edsl/base/enums.py +215 -0
- edsl/base.py +8 -0
- edsl/buckets/__init__.py +25 -0
- edsl/buckets/bucket_collection.py +324 -0
- edsl/buckets/model_buckets.py +206 -0
- edsl/buckets/token_bucket.py +502 -0
- edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
- edsl/buckets/token_bucket_client.py +509 -0
- edsl/caching/__init__.py +20 -0
- edsl/caching/cache.py +814 -0
- edsl/caching/cache_entry.py +427 -0
- edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
- edsl/caching/exceptions.py +24 -0
- edsl/caching/orm.py +30 -0
- edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
- edsl/caching/sql_dict.py +441 -0
- edsl/config/__init__.py +8 -0
- edsl/config/config_class.py +177 -0
- edsl/config.py +4 -176
- edsl/conversation/Conversation.py +7 -7
- edsl/conversation/car_buying.py +4 -4
- edsl/conversation/chips.py +6 -6
- edsl/coop/__init__.py +25 -2
- edsl/coop/coop.py +303 -67
- edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
- edsl/coop/exceptions.py +62 -0
- edsl/coop/price_fetcher.py +126 -0
- edsl/coop/utils.py +89 -24
- edsl/data_transfer_models.py +5 -72
- edsl/dataset/__init__.py +10 -0
- edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
- edsl/{results/DatasetExportMixin.py → dataset/dataset_operations_mixin.py} +606 -122
- edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
- edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
- edsl/{results → dataset/display}/table_renderers.py +58 -2
- edsl/{results → dataset}/file_exports.py +4 -5
- edsl/{results → dataset}/smart_objects.py +2 -2
- edsl/enums.py +5 -205
- edsl/inference_services/__init__.py +5 -0
- edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
- edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
- edsl/inference_services/data_structures.py +3 -2
- edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
- edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
- edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
- edsl/inference_services/registry.py +4 -41
- edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
- edsl/inference_services/services/__init__.py +31 -0
- edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
- edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
- edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
- edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
- edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
- edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
- edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
- edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
- edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
- edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
- edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +3 -7
- edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
- edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
- edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
- edsl/inference_services/write_available.py +1 -2
- edsl/instructions/__init__.py +6 -0
- edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
- edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
- edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
- edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
- edsl/interviews/__init__.py +4 -0
- edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
- edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
- edsl/interviews/interview.py +638 -0
- edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
- edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
- edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
- edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
- edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
- edsl/invigilators/__init__.py +38 -0
- edsl/invigilators/invigilator_base.py +477 -0
- edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
- edsl/invigilators/prompt_constructor.py +476 -0
- edsl/{agents → invigilators}/prompt_helpers.py +2 -1
- edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
- edsl/{agents → invigilators}/question_option_processor.py +96 -21
- edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
- edsl/jobs/__init__.py +7 -1
- edsl/jobs/async_interview_runner.py +99 -35
- edsl/jobs/check_survey_scenario_compatibility.py +7 -5
- edsl/jobs/data_structures.py +153 -22
- edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
- edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
- edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
- edsl/jobs/{Jobs.py → jobs.py} +313 -167
- edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
- edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +19 -17
- edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
- edsl/jobs/jobs_pricing_estimation.py +347 -0
- edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
- edsl/jobs/jobs_runner_asyncio.py +282 -0
- edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
- edsl/jobs/results_exceptions_handler.py +2 -2
- edsl/key_management/__init__.py +28 -0
- edsl/key_management/key_lookup.py +161 -0
- edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
- edsl/key_management/key_lookup_collection.py +82 -0
- edsl/key_management/models.py +218 -0
- edsl/language_models/__init__.py +7 -2
- edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
- edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
- edsl/language_models/language_model.py +1080 -0
- edsl/language_models/model.py +10 -25
- edsl/language_models/{ModelList.py → model_list.py} +9 -14
- edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
- edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
- edsl/language_models/repair.py +4 -4
- edsl/language_models/utilities.py +4 -4
- edsl/notebooks/__init__.py +3 -1
- edsl/notebooks/{Notebook.py → notebook.py} +7 -8
- edsl/prompts/__init__.py +1 -1
- edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
- edsl/prompts/{Prompt.py → prompt.py} +101 -95
- edsl/questions/HTMLQuestion.py +1 -1
- edsl/questions/__init__.py +154 -25
- edsl/questions/answer_validator_mixin.py +1 -1
- edsl/questions/compose_questions.py +4 -3
- edsl/questions/derived/question_likert_five.py +166 -0
- edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
- edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
- edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
- edsl/questions/descriptors.py +24 -30
- edsl/questions/loop_processor.py +65 -19
- edsl/questions/question_base.py +881 -0
- edsl/questions/question_base_gen_mixin.py +15 -16
- edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
- edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
- edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
- edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
- edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
- edsl/questions/question_free_text.py +282 -0
- edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
- edsl/questions/{QuestionList.py → question_list.py} +6 -7
- edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
- edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
- edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
- edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
- edsl/questions/question_registry.py +4 -9
- edsl/questions/register_questions_meta.py +8 -4
- edsl/questions/response_validator_abc.py +17 -16
- edsl/results/__init__.py +4 -1
- edsl/{exceptions/results.py → results/exceptions.py} +1 -1
- edsl/results/report.py +197 -0
- edsl/results/{Result.py → result.py} +131 -45
- edsl/results/{Results.py → results.py} +365 -220
- edsl/results/results_selector.py +344 -25
- edsl/scenarios/__init__.py +30 -3
- edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
- edsl/scenarios/directory_scanner.py +156 -13
- edsl/scenarios/document_chunker.py +186 -0
- edsl/scenarios/exceptions.py +101 -0
- edsl/scenarios/file_methods.py +2 -3
- edsl/scenarios/{FileStore.py → file_store.py} +275 -189
- edsl/scenarios/handlers/__init__.py +14 -14
- edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
- edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
- edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
- edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
- edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
- edsl/scenarios/handlers/latex_file_store.py +5 -0
- edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
- edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
- edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
- edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
- edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
- edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
- edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
- edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
- edsl/scenarios/scenario.py +928 -0
- edsl/scenarios/scenario_join.py +18 -5
- edsl/scenarios/{ScenarioList.py → scenario_list.py} +294 -106
- edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
- edsl/scenarios/scenario_selector.py +5 -1
- edsl/study/ObjectEntry.py +2 -2
- edsl/study/SnapShot.py +5 -5
- edsl/study/Study.py +18 -19
- edsl/study/__init__.py +6 -4
- edsl/surveys/__init__.py +7 -4
- edsl/surveys/dag/__init__.py +2 -0
- edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
- edsl/surveys/{DAG.py → dag/dag.py} +13 -10
- edsl/surveys/descriptors.py +1 -1
- edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
- edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
- edsl/surveys/memory/__init__.py +3 -0
- edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
- edsl/surveys/rules/__init__.py +3 -0
- edsl/surveys/{Rule.py → rules/rule.py} +103 -43
- edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
- edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
- edsl/surveys/survey.py +1743 -0
- edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
- edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
- edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
- edsl/tasks/__init__.py +32 -0
- edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
- edsl/tasks/task_creators.py +135 -0
- edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
- edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
- edsl/tasks/task_status_log.py +85 -0
- edsl/tokens/__init__.py +2 -0
- edsl/tokens/interview_token_usage.py +53 -0
- edsl/utilities/PrettyList.py +1 -1
- edsl/utilities/SystemInfo.py +25 -22
- edsl/utilities/__init__.py +29 -21
- edsl/utilities/gcp_bucket/__init__.py +2 -0
- edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
- edsl/utilities/interface.py +44 -536
- edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
- edsl/utilities/repair_functions.py +1 -1
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/METADATA +1 -1
- edsl-0.1.48.dist-info/RECORD +347 -0
- edsl/Base.py +0 -493
- edsl/BaseDiff.py +0 -260
- edsl/agents/InvigilatorBase.py +0 -260
- edsl/agents/PromptConstructor.py +0 -318
- edsl/coop/PriceFetcher.py +0 -54
- edsl/data/Cache.py +0 -582
- edsl/data/CacheEntry.py +0 -238
- edsl/data/SQLiteDict.py +0 -292
- edsl/data/__init__.py +0 -5
- edsl/data/orm.py +0 -10
- edsl/exceptions/cache.py +0 -5
- edsl/exceptions/coop.py +0 -14
- edsl/exceptions/data.py +0 -14
- edsl/exceptions/scenarios.py +0 -29
- edsl/jobs/Answers.py +0 -43
- edsl/jobs/JobsPrompts.py +0 -354
- edsl/jobs/buckets/BucketCollection.py +0 -134
- edsl/jobs/buckets/ModelBuckets.py +0 -65
- edsl/jobs/buckets/TokenBucket.py +0 -283
- edsl/jobs/buckets/TokenBucketClient.py +0 -191
- edsl/jobs/interviews/Interview.py +0 -395
- edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
- edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
- edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
- edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
- edsl/jobs/tasks/TaskCreators.py +0 -64
- edsl/jobs/tasks/TaskStatusLog.py +0 -23
- edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
- edsl/language_models/LanguageModel.py +0 -635
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/key_management/KeyLookup.py +0 -63
- edsl/language_models/key_management/KeyLookupCollection.py +0 -38
- edsl/language_models/key_management/models.py +0 -137
- edsl/questions/QuestionBase.py +0 -544
- edsl/questions/QuestionFreeText.py +0 -130
- edsl/questions/derived/QuestionLikertFive.py +0 -76
- edsl/results/ResultsExportMixin.py +0 -45
- edsl/results/TextEditor.py +0 -50
- edsl/results/results_fetch_mixin.py +0 -33
- edsl/results/results_tools_mixin.py +0 -98
- edsl/scenarios/DocumentChunker.py +0 -104
- edsl/scenarios/Scenario.py +0 -548
- edsl/scenarios/ScenarioHtmlMixin.py +0 -65
- edsl/scenarios/ScenarioListExportMixin.py +0 -45
- edsl/scenarios/handlers/latex.py +0 -5
- edsl/shared.py +0 -1
- edsl/surveys/Survey.py +0 -1301
- edsl/surveys/SurveyQualtricsImport.py +0 -284
- edsl/surveys/SurveyToApp.py +0 -141
- edsl/surveys/instructions/__init__.py +0 -0
- edsl/tools/__init__.py +0 -1
- edsl/tools/clusters.py +0 -192
- edsl/tools/embeddings.py +0 -27
- edsl/tools/embeddings_plotting.py +0 -118
- edsl/tools/plotting.py +0 -112
- edsl/tools/summarize.py +0 -18
- edsl/utilities/data/Registry.py +0 -6
- edsl/utilities/data/__init__.py +0 -1
- edsl/utilities/data/scooter_results.json +0 -1
- edsl-0.1.47.dist-info/RECORD +0 -354
- /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
- /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
- /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
- /edsl/{results → dataset/display}/table_data_class.py +0 -0
- /edsl/{results → dataset/display}/table_display.css +0 -0
- /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
- /edsl/{results → dataset}/tree_explore.py +0 -0
- /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
- /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
- /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
- /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
- /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
- /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
- /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
- /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
- /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
- /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
- /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
- /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
- /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
- /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
- /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
@@ -0,0 +1,928 @@
|
|
1
|
+
"""
|
2
|
+
A Scenario is a dictionary-like object that stores key-value pairs for parameterizing questions.
|
3
|
+
|
4
|
+
Scenarios are a fundamental concept in EDSL, providing a mechanism to parameterize
|
5
|
+
questions with dynamic values. Each Scenario contains key-value pairs that can be
|
6
|
+
referenced within question templates using Jinja syntax. This allows for creating
|
7
|
+
questions that vary based on the specific scenario being presented.
|
8
|
+
|
9
|
+
Key features include:
|
10
|
+
- Dictionary-like behavior (inherits from UserDict)
|
11
|
+
- Support for combination operations (addition, multiplication)
|
12
|
+
- Conversion to/from various formats (dict, dataset)
|
13
|
+
- Methods for file and data source integration
|
14
|
+
|
15
|
+
Scenarios can be created from various sources including files, URLs, PDFs, images,
|
16
|
+
and HTML content. They serve as the primary mechanism for providing context or variable
|
17
|
+
information to questions in surveys.
|
18
|
+
"""
|
19
|
+
|
20
|
+
from __future__ import annotations
|
21
|
+
import copy
|
22
|
+
import os
|
23
|
+
import json
|
24
|
+
from collections import UserDict
|
25
|
+
from typing import Union, List, Optional, TYPE_CHECKING, Collection
|
26
|
+
from uuid import uuid4
|
27
|
+
|
28
|
+
from ..base import Base
|
29
|
+
from ..utilities import remove_edsl_version
|
30
|
+
from .exceptions import ScenarioError
|
31
|
+
|
32
|
+
if TYPE_CHECKING:
|
33
|
+
from .scenario_list import ScenarioList
|
34
|
+
from ..dataset import Dataset
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
class Scenario(Base, UserDict):
|
39
|
+
"""
|
40
|
+
A dictionary-like object that stores key-value pairs for parameterizing questions.
|
41
|
+
|
42
|
+
A Scenario inherits from both the EDSL Base class and Python's UserDict, allowing
|
43
|
+
it to function as a dictionary while providing additional functionality. Scenarios
|
44
|
+
are used to parameterize questions by providing variable data that can be referenced
|
45
|
+
within question templates using Jinja syntax.
|
46
|
+
|
47
|
+
Scenarios can be created directly with dictionary data or constructed from various
|
48
|
+
sources using class methods (from_file, from_url, from_pdf, etc.). They support
|
49
|
+
operations like addition (combining scenarios) and multiplication (creating cross
|
50
|
+
products with other scenarios or scenario lists).
|
51
|
+
|
52
|
+
Attributes:
|
53
|
+
data (dict): The underlying dictionary data.
|
54
|
+
name (str, optional): A name for the scenario.
|
55
|
+
|
56
|
+
Examples:
|
57
|
+
Create a simple scenario:
|
58
|
+
>>> s = Scenario({"product": "coffee", "price": 4.99})
|
59
|
+
|
60
|
+
Combine scenarios:
|
61
|
+
>>> s1 = Scenario({"product": "coffee"})
|
62
|
+
>>> s2 = Scenario({"price": 4.99})
|
63
|
+
>>> s3 = s1 + s2
|
64
|
+
>>> s3
|
65
|
+
Scenario({'product': 'coffee', 'price': 4.99})
|
66
|
+
|
67
|
+
Create a scenario from a file:
|
68
|
+
>>> import tempfile
|
69
|
+
>>> with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
|
70
|
+
... _ = f.write("Hello World")
|
71
|
+
... data_path = f.name
|
72
|
+
>>> s = Scenario.from_file(data_path, "document")
|
73
|
+
>>> import os
|
74
|
+
>>> os.unlink(data_path) # Clean up temp file
|
75
|
+
"""
|
76
|
+
|
77
|
+
__documentation__ = "https://docs.expectedparrot.com/en/latest/scenarios.html"
|
78
|
+
|
79
|
+
def __init__(self, data: Optional[dict] = None, name: Optional[str] = None):
    """
    Initialize a new Scenario.

    Args:
        data: A dictionary of key-value pairs for parameterizing questions.
            Anything ``dict()`` can convert (a mapping, an iterable of
            key/value pairs) is accepted. ``None`` produces an empty scenario.
        name: An optional name for the scenario to aid in identification.

    Raises:
        ScenarioError: If the data cannot be converted to a dictionary. The
            original conversion error is attached as ``__cause__``.

    Examples:
        >>> s = Scenario({"product": "coffee", "price": 4.99})
        >>> s = Scenario({"question": "What is your favorite color?"}, name="color_question")
    """
    if data is not None and not isinstance(data, dict):
        try:
            data = dict(data)
        except Exception as e:
            # Deliberately broad: any failure while coercing arbitrary input
            # is reported as a ScenarioError, as before.
            # The details are folded into ONE message string. ScenarioError's
            # definition is not visible here (presumably it takes a single
            # message -- TODO confirm); a single string is valid either way,
            # whereas the former second positional argument may not be.
            raise ScenarioError(
                "You must pass in a dictionary to initialize a Scenario. "
                f"You passed in {data}. Exception message: {e}"
            ) from e

    super().__init__()
    # A dict passed by the caller is stored as-is (not copied).
    self.data = data if data is not None else {}
    self.name = name
|
107
|
+
|
108
|
+
def __mul__(self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]) -> "ScenarioList":
    """Form the cross product of this Scenario with a Scenario or ScenarioList.

    Args:
        scenario_list_or_scenario: The right-hand operand; either a single
            Scenario or a ScenarioList.

    Returns:
        A ScenarioList holding the cross product.

    Raises:
        TypeError: If the operand is neither a Scenario nor a ScenarioList.

    Example:
        >>> s1 = Scenario({'a': 1})
        >>> s2 = Scenario({'b': 2})
        >>> s1 * s2
        ScenarioList([Scenario({'a': 1, 'b': 2})])

        >>> from edsl.scenarios import ScenarioList
        >>> sl = ScenarioList([Scenario({'b': 2}), Scenario({'b': 3})])
        >>> new_s = s1 * sl
        >>> new_s == ScenarioList([Scenario({'a': 1, 'b': 2}), Scenario({'a': 1, 'b': 3})])
        True
    """
    # Imported here rather than at module level (the module only imports
    # ScenarioList under TYPE_CHECKING).
    from .scenario_list import ScenarioList

    other = scenario_list_or_scenario
    if isinstance(other, ScenarioList):
        # The ScenarioList becomes the left operand of the multiplication.
        return other * self
    if isinstance(other, Scenario):
        # Wrap self in a one-element list and multiply list-by-scenario.
        return ScenarioList([self]) * other
    raise TypeError(f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}")
|
136
|
+
|
137
|
+
def replicate(self, n: int) -> "ScenarioList":
    """Return a ScenarioList made of ``n`` independent deep copies of this scenario.

    :param n: The number of times to replicate the scenario.

    Example:
    >>> s = Scenario({"food": "wood chips"})
    >>> s.replicate(2)
    ScenarioList([Scenario({'food': 'wood chips'}), Scenario({'food': 'wood chips'})])
    """
    from .scenario_list import ScenarioList

    copies = []
    for _ in range(n):
        # Each entry is a separate deep copy, so the replicas share no state.
        copies.append(copy.deepcopy(self))
    return ScenarioList(copies)
|
150
|
+
|
151
|
+
@property
def has_jinja_braces(self) -> bool:
    """Report whether any string value contains both ``{{`` and ``}}``.

    Non-string values are ignored. This matters for rendering.

    >>> s = Scenario({"food": "I love {{wood chips}}"})
    >>> s.has_jinja_braces
    True
    """
    return any(
        isinstance(value, str) and "{{" in value and "}}" in value
        for _, value in self.items()
    )
|
164
|
+
|
165
|
+
def _convert_jinja_braces(
    self, replacement_left: str = "<<", replacement_right: str = ">>"
) -> Scenario:
    """Return a new Scenario whose string values have Jinja braces swapped out.

    Every ``{{`` becomes ``replacement_left`` and every ``}}`` becomes
    ``replacement_right``; non-string values are carried over unchanged.

    >>> s = Scenario({"food": "I love {{wood chips}}"})
    >>> s._convert_jinja_braces()
    Scenario({'food': 'I love <<wood chips>>'})

    """
    converted = Scenario()
    for key, value in self.items():
        if not isinstance(value, str):
            converted[key] = value
            continue
        # Opening braces are replaced first, then closing braces.
        left_swapped = value.replace("{{", replacement_left)
        converted[key] = left_swapped.replace("}}", replacement_right)
    return converted
|
184
|
+
|
185
|
+
def __add__(self, other_scenario: Scenario) -> Scenario:
    """Merge two scenarios into a new one containing the union of their keys.

    Adding ``None`` returns this scenario itself (not a copy). On a key
    collision the value from ``other_scenario`` wins.

    :param other_scenario: The other scenario to combine with.

    Example:

    >>> s1 = Scenario({"price": 100, "quantity": 2})
    >>> s2 = Scenario({"color": "red"})
    >>> s1 + s2
    Scenario({'price': 100, 'quantity': 2, 'color': 'red'})
    >>> (s1 + s2).__class__.__name__
    'Scenario'
    """
    if other_scenario is None:
        return self

    # Deep-copy both sides so the result shares no nested objects with either.
    own = copy.deepcopy(self.data)
    theirs = copy.deepcopy(other_scenario.data)
    return Scenario(own | theirs)
|
208
|
+
|
209
|
+
def rename(
    self,
    old_name_or_replacement_dict: Union[str, dict[str, str]],
    new_name: Optional[str] = None,
) -> Scenario:
    """Return a new Scenario with some keys renamed.

    :param old_name_or_replacement_dict: A dictionary of old keys to new keys *OR* a string of the old key.
    :param new_name: The new name of the key. Required when the first
        argument is a string; ignored when it is a dictionary.
    :raises ScenarioError: If a single old key is given as a string but
        ``new_name`` is omitted.

    Example:

    >>> s = Scenario({"food": "wood chips"})
    >>> s.rename({"food": "food_preference"})
    Scenario({'food_preference': 'wood chips'})

    >>> s = Scenario({"food": "wood chips"})
    >>> s.rename("food", "snack")
    Scenario({'snack': 'wood chips'})
    """
    if isinstance(old_name_or_replacement_dict, str):
        if new_name is None:
            # Without this guard the bare string would be used as the
            # "mapping" below: `key in "food"` is a substring test and
            # `"food"[key]` raises an unrelated TypeError.
            raise ScenarioError(
                "rename() requires new_name when the old key is given as a string."
            )
        replacement_dict = {old_name_or_replacement_dict: new_name}
    else:
        replacement_dict = old_name_or_replacement_dict

    new_scenario = Scenario()
    for key, value in self.items():
        # Keys absent from the mapping keep their original name.
        target_key = replacement_dict[key] if key in replacement_dict else key
        new_scenario[target_key] = value
    return new_scenario
|
241
|
+
|
242
|
+
def new_column_names(self, new_names: List[str]) -> Scenario:
    """Return a new Scenario whose keys are replaced, in order, by ``new_names``.

    The i-th name in ``new_names`` replaces the i-th key of this scenario;
    values are carried over unchanged.

    :param new_names: The replacement key names, one per existing key.
    :raises ScenarioError: If the number of names differs from the number of keys.

    >>> s = Scenario({"food": "wood chips"})
    >>> s.new_column_names(["food_preference"])
    Scenario({'food_preference': 'wood chips'})
    """
    # Validate explicitly: an `assert` is stripped under `python -O`, and
    # letting zip() truncate would silently drop keys or names.
    if len(new_names) != len(self.keys()):
        raise ScenarioError(
            "The number of new names must match the number of keys. "
            f"Got {len(new_names)} names for {len(self.keys())} keys."
        )

    new_scenario = Scenario()
    for new_name, value in zip(new_names, self.values()):
        new_scenario[new_name] = value
    return new_scenario
|
258
|
+
|
259
|
+
def table(self, tablefmt: str = "grid") -> str:
    """Render the scenario as a table by way of its Dataset form.

    :param tablefmt: Table format name, passed through to the dataset's ``table`` method.
    """
    dataset = self.to_dataset()
    return dataset.table(tablefmt=tablefmt)
|
262
|
+
|
263
|
+
|
264
|
+
def to_dict(self, add_edsl_version: bool = True) -> dict:
    """Serialize the scenario to a plain dictionary.

    FileStore values are themselves converted with their own ``to_dict``.
    When ``add_edsl_version`` is true, the ``edsl_version`` and
    ``edsl_class_name`` entries are added to the result.

    Example:

    >>> s = Scenario({"food": "wood chips"})
    >>> s.to_dict()
    {'food': 'wood chips', 'edsl_version': '...', 'edsl_class_name': 'Scenario'}

    >>> s.to_dict(add_edsl_version = False)
    {'food': 'wood chips'}

    """
    from edsl.scenarios import FileStore

    # Work on a shallow copy so self.data is left untouched.
    serialized = self.data.copy()
    file_keys = [key for key, value in serialized.items() if isinstance(value, FileStore)]
    for key in file_keys:
        serialized[key] = serialized[key].to_dict(add_edsl_version=add_edsl_version)

    if not add_edsl_version:
        return serialized

    from edsl import __version__

    serialized["edsl_version"] = __version__
    serialized["edsl_class_name"] = "Scenario"
    return serialized
|
290
|
+
|
291
|
+
def __hash__(self) -> int:
    """Hash the scenario from its dictionary form, excluding EDSL version info.

    Example:

    >>> s = Scenario({"food": "wood chips"})
    >>> hash(s)
    1153210385458344214
    """
    from edsl.utilities.utilities import dict_hash

    # Version metadata is left out, so it does not feed into the hash.
    unversioned = self.to_dict(add_edsl_version=False)
    return dict_hash(unversioned)
|
303
|
+
|
304
|
+
def __repr__(self) -> str:
    """Return ``Scenario(<repr of the underlying data dict>)``."""
    return f"Scenario({self.data!r})"
|
306
|
+
|
307
|
+
def to_dataset(self) -> "Dataset":
    """Build a two-column Dataset: one ``key`` column and one ``value`` column.

    >>> s = Scenario({"food": "wood chips"})
    >>> s.to_dataset()
    Dataset([{'key': ['food']}, {'value': ['wood chips']}])
    """
    from ..dataset import Dataset

    key_column = {"key": list(self.keys())}
    value_column = {"value": list(self.values())}
    return Dataset([key_column, value_column])
|
319
|
+
|
320
|
+
def select(self, list_of_keys: Collection[str]) -> "Scenario":
    """Return a new Scenario containing only the requested keys.

    Keys are copied in the order given; a key missing from this scenario
    propagates the lookup error from ``self[key]``.

    :param list_of_keys: The keys to select.

    Example:

    >>> s = Scenario({"food": "wood chips", "drink": "water"})
    >>> s.select(["food"])
    Scenario({'food': 'wood chips'})
    """
    subset = Scenario()
    for wanted in list_of_keys:
        subset[wanted] = self[wanted]
    return subset
|
335
|
+
|
336
|
+
def drop(self, list_of_keys: Collection[str]) -> "Scenario":
    """Return a new Scenario without the listed keys.

    Keys in ``list_of_keys`` that are not present in this scenario are ignored.

    :param list_of_keys: The keys to drop.

    Example:

    >>> s = Scenario({"food": "wood chips", "drink": "water"})
    >>> s.drop(["food"])
    Scenario({'drink': 'water'})
    """
    remaining = Scenario()
    for key in self.keys():
        if key in list_of_keys:
            continue
        remaining[key] = self[key]
    return remaining
|
352
|
+
|
353
|
+
def keep(self, list_of_keys: List[str]) -> "Scenario":
    """Return a new Scenario restricted to the given keys; delegates to ``select``.

    :param list_of_keys: The keys to keep.

    Example:

    >>> s = Scenario({"food": "wood chips", "drink": "water"})
    >>> s.keep(["food"])
    Scenario({'food': 'wood chips'})
    """
    kept = self.select(list_of_keys)
    return kept
|
366
|
+
|
367
|
+
@classmethod
def from_url(cls, url: str, field_name: Optional[str] = "text", testing: bool = False) -> "Scenario":
    """
    Creates a Scenario from the content of a URL.

    The URL is fetched with ``requests``. When both BeautifulSoup and
    fake_useragent can be imported (and ``testing`` is False), the request is
    sent with browser-like headers and only the text of paragraph and heading
    tags is kept. Otherwise the raw response body is stored.

    Args:
        url: The URL to fetch content from.
        field_name: The key name to use for storing the extracted text in the
            Scenario. Defaults to "text"; a falsy value (e.g. None) also falls
            back to "text", matching ``from_html``.
        testing: If True, skips BeautifulSoup/fake_useragent and stores the raw
            response body. This is primarily for testing purposes.

    Returns:
        A Scenario containing the URL (key "url") and the extracted text.

    Raises:
        requests.exceptions.RequestException: If the URL cannot be accessed,
            including when no response arrives within the timeout.

    Examples:
        >>> s = Scenario.from_url("https://example.com", testing=True)
        >>> "url" in s and "text" in s
        True

        >>> s = Scenario.from_url("https://example.com", field_name="content", testing=True)
        >>> "url" in s and "content" in s
        True

    Notes:
        - HTTP error status codes are not turned into exceptions here; the
          body of an error page is stored like any other response.
        - When the optional packages are missing a message is printed and the
          basic path is used.
    """
    import requests

    # Same limit fetch_html uses; without a timeout a stalled server would
    # block the caller indefinitely.
    timeout_seconds = 10

    if not field_name:
        field_name = "text"

    # Only the optional imports can legitimately raise ImportError, so the
    # try block covers nothing else.
    use_parser = False
    if not testing:
        try:
            from bs4 import BeautifulSoup
            from fake_useragent import UserAgent
        except ImportError:
            # Fallback to basic requests if BeautifulSoup/fake_useragent not available
            print("BeautifulSoup/fake_useragent not available. Falling back to basic requests.")
        else:
            use_parser = True

    if use_parser:
        # Configure request headers to appear more like a regular browser
        ua = UserAgent()
        headers = {
            'User-Agent': ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        }

        response = requests.get(url, headers=headers, timeout=timeout_seconds)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get text content while preserving some structure
        text = ' '.join(
            tag.get_text(strip=True)
            for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        )
    else:
        # Simple path: used for testing and when the optional packages are missing
        response = requests.get(url, timeout=timeout_seconds)
        text = response.text

    return cls({"url": url, field_name: text})
|
436
|
+
|
437
|
+
@classmethod
def from_file(cls, file_path: str, field_name: str) -> "Scenario":
    """
    Wraps a file in a FileStore and returns it as a one-entry Scenario.

    The resulting Scenario has a single key, ``field_name``, whose value is
    ``FileStore(file_path)``. Everything about how the file is read is left to
    FileStore.

    Args:
        file_path: Path to the file to be incorporated into the Scenario.
        field_name: Key name to use for storing the FileStore in the Scenario.

    Returns:
        A Scenario containing a FileStore object built from the specified file.

    Raises:
        FileNotFoundError: If the specified file does not exist
            (NOTE(review): raised by FileStore, not by this method - confirm).

    Examples:
        >>> import tempfile
        >>> with tempfile.NamedTemporaryFile(suffix=".txt", mode="w") as f:
        ...     _ = f.write("This is a test.")
        ...     _ = f.flush()
        ...     s = Scenario.from_file(f.name, "file")
        >>> s
        Scenario({'file': FileStore(path='...', ...)})
    """
    from edsl.scenarios import FileStore

    return cls({field_name: FileStore(file_path)})
|
474
|
+
|
475
|
+
@classmethod
def from_image(
    cls, image_path: str, image_name: Optional[str] = None
) -> "Scenario":
    """
    Creates a Scenario holding an image file, stored via ``from_file``.

    After checking that the path exists, this delegates to ``from_file`` so
    the Scenario has one key whose value is a FileStore for the image.

    Args:
        image_path: Path to the image file to be incorporated into the Scenario.
        image_name: Key name to use for storing the FileStore in the Scenario.
            If not provided, the file's base name up to its first "." is used
            (e.g. "image.jpg" -> "image").

    Returns:
        A Scenario containing a FileStore object for the image.

    Raises:
        FileNotFoundError: If the specified image file does not exist.

    Examples:
        >>> import os
        >>> # Assuming an image file exists
        >>> if os.path.exists("image.jpg"):
        ...     s = Scenario.from_image("image.jpg")
        ...     s_named = Scenario.from_image("image.jpg", "picture")
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    key = image_name
    if key is None:
        # Everything before the first dot of the file name.
        key = os.path.basename(image_path).partition(".")[0]

    return cls.from_file(image_path, key)
|
517
|
+
|
518
|
+
@classmethod
def from_pdf(cls, pdf_path: str) -> "Scenario":
    """
    Creates a Scenario containing content extracted from a PDF file.

    The PDF is handed to ``PdfExtractor`` and the dictionary returned by its
    ``get_pdf_dict()`` becomes the Scenario's data. The instance is built with
    ``cls`` so subclasses get an instance of their own type.

    Args:
        pdf_path: Path to the PDF file to extract content from.

    Returns:
        A Scenario built from the extractor's dictionary.

    Raises:
        ImportError: If the extractor or a library it needs cannot be
            imported. The original error is attached as ``__cause__``.
        FileNotFoundError: If the specified PDF file does not exist
            (NOTE(review): raised by PdfExtractor, not here - confirm).

    Examples:
        >>> import os
        >>> # Assuming a PDF file exists
        >>> if os.path.exists("document.pdf"):
        ...     s = Scenario.from_pdf("document.pdf")

    Notes:
        - PDF extraction requires the PyMuPDF library
    """
    try:
        from edsl.scenarios.PdfExtractor import PdfExtractor

        extractor = PdfExtractor(pdf_path)
        return cls(extractor.get_pdf_dict())
    except ImportError as e:
        # Chain the original error so the missing module stays visible in
        # the traceback.
        raise ImportError(
            f"Could not extract text from PDF: {str(e)}. "
            "PDF extraction requires the PyMuPDF library. "
            "Install it with: pip install pymupdf"
        ) from e
|
558
|
+
|
559
|
+
@classmethod
def from_html(cls, url: str, field_name: Optional[str] = None) -> "Scenario":
    """
    Creates a Scenario holding a page's URL, its raw HTML and its extracted text.

    The HTML is obtained with ``fetch_html`` and converted to text with
    ``extract_text``. Unlike ``from_url``, the raw HTML is kept in the result
    under the "html" key.

    Args:
        url: URL to fetch HTML content from.
        field_name: Key name to use for the extracted text in the Scenario.
            A falsy value (the default None, or "") means "text".

    Returns:
        A Scenario with the keys "url", "html" and the text key.

    Examples:
        >>> s = Scenario.from_html("https://example.com")
        >>> all(key in s for key in ["url", "html", "text"])
        True

        >>> s = Scenario.from_html("https://example.com", field_name="content")
        >>> all(key in s for key in ["url", "html", "content"])
        True

    Notes:
        - ``fetch_html`` returns None when the request fails; in that case
          "html" is None and ``extract_text`` yields an empty string.
    """
    raw_html = cls.fetch_html(url)
    extracted = cls.extract_text(raw_html)
    text_key = field_name or "text"
    return cls({"url": url, "html": raw_html, text_key: extracted})
|
599
|
+
|
600
|
+
@staticmethod
def fetch_html(url: str) -> Optional[str]:
    """
    Fetches HTML content from a URL with retries and a browser-like user agent.

    A ``requests`` session is configured to retry up to 5 times on HTTP
    500/502/503/504 responses, then a GET is issued with a 10 second timeout.
    The session is closed before returning.

    Args:
        url: The URL to fetch HTML content from.

    Returns:
        The response body as a string, or None if the request failed
        (connection error, timeout, or an HTTP error status). Request errors
        are caught, printed and reported as None; they are not raised.
    """
    import requests
    from requests.adapters import HTTPAdapter
    # urllib3 is a hard dependency of requests; importing it directly avoids
    # the legacy ``requests.packages`` alias.
    from urllib3.util.retry import Retry

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    retries = Retry(
        total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]
    )

    # The context manager releases the session's pooled connections on every
    # exit path, including the error path.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retries))
        session.mount("https://", HTTPAdapter(max_retries=retries))

        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None
|
642
|
+
|
643
|
+
@staticmethod
def extract_text(html: Optional[str]) -> str:
    """
    Turns HTML into plain text using BeautifulSoup.

    Script and style elements are removed, the remaining text is split into
    lines and then on spaces, and the non-empty pieces are joined with
    newlines.

    Args:
        html: The HTML content to extract text from.

    Returns:
        The extracted text. An empty string is returned when ``html`` is None
        or when anything goes wrong during extraction (the error is printed).
    """
    if html is None:
        return ""

    try:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        # Scripts and stylesheets carry no readable content.
        for unwanted in soup(["script", "style"]):
            unwanted.extract()

        # NOTE(review): splitting on a single space puts every word on its
        # own output line; the usual recipe splits on a double space. Confirm
        # which separator is intended.
        pieces = []
        for raw_line in soup.get_text().splitlines():
            for fragment in raw_line.strip().split(" "):
                fragment = fragment.strip()
                if fragment:
                    pieces.append(fragment)

        return '\n'.join(pieces)
    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""
|
680
|
+
|
681
|
+
|
682
|
+
@classmethod
def from_pdf_to_image(cls, pdf_path: str, image_format: str = "jpeg") -> "Scenario":
    """
    Converts each page of a PDF into an image and creates a Scenario containing them.

    Every page is rendered with ``pdf2image``, saved into a temporary
    directory in the requested format, and wrapped in a FileStore. The
    Scenario maps "filepath" to the original PDF path and "page_0", "page_1",
    ... to the page images.

    Args:
        pdf_path: Path to the PDF file to convert to images.
        image_format: Format of the output images (default is 'jpeg').
            It is used as the file extension and, upper-cased, as the
            format name passed to the image's ``save`` method.

    Returns:
        A Scenario containing the original PDF file path and one FileStore
        per page image.

    Raises:
        FileNotFoundError: If the specified PDF file does not exist.
        ImportError: If pdf2image is not installed.

    Examples:
        >>> import os
        >>> # Assuming a PDF file exists
        >>> if os.path.exists("document.pdf"):
        ...     s = Scenario.from_pdf_to_image("document.pdf")
        ...     s_png = Scenario.from_pdf_to_image("document.pdf", "png")

    Notes:
        - Requires the pdf2image library
        - Images are written to a temporary directory that is removed when
          the method returns
    """
    # Fail early with the documented exception, before any optional
    # dependency is imported.
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    import tempfile
    from pdf2image import convert_from_path
    from edsl.scenarios import FileStore

    scenario_dict = {"filepath": pdf_path}

    with tempfile.TemporaryDirectory() as output_folder:
        # Save each page as an image and wrap it in a FileStore
        for i, image in enumerate(convert_from_path(pdf_path)):
            image_path = os.path.join(output_folder, f"page_{i}.{image_format}")
            image.save(image_path, image_format.upper())
            scenario_dict[f"page_{i}"] = FileStore(image_path)

        # Build the result while the image files still exist.
        return cls(scenario_dict)
|
739
|
+
|
740
|
+
@classmethod
def from_docx(cls, docx_path: str) -> "Scenario":
    """
    Creates a Scenario containing text extracted from a Microsoft Word document.

    The file is handed to ``DocxScenario`` and the dictionary returned by its
    ``get_scenario_dict()`` becomes the Scenario's data. The instance is built
    with ``cls`` so subclasses get an instance of their own type.

    Args:
        docx_path: Path to the DOCX file to extract content from.

    Returns:
        A Scenario containing the file path and extracted text from the DOCX file.

    Raises:
        FileNotFoundError: If the specified DOCX file does not exist
            (NOTE(review): raised by DocxScenario, not here - confirm).
        ImportError: If the python-docx library is not installed.

    Examples:
        >>> from docx import Document
        >>> doc = Document()
        >>> _ = doc.add_heading("EDSL Survey")
        >>> _ = doc.add_paragraph("This is a test.")
        >>> doc.save("test.docx")
        >>> s = Scenario.from_docx("test.docx")
        >>> s
        Scenario({'file_path': 'test.docx', 'text': 'EDSL Survey\\nThis is a test.'})
        >>> import os; os.remove("test.docx")
    """
    from edsl.scenarios.DocxScenario import DocxScenario

    return cls(DocxScenario(docx_path).get_scenario_dict())
|
778
|
+
|
779
|
+
def chunk(
    self,
    field: str,
    num_words: Optional[int] = None,
    num_lines: Optional[int] = None,
    include_original: bool = False,
    hash_original: bool = False,
) -> "ScenarioList":
    """
    Breaks one text field of this Scenario into pieces and returns them as a ScenarioList.

    All of the work is delegated to ``DocumentChunker``; this method only
    forwards its arguments. As the examples show, every resulting Scenario
    carries the chunk text under ``field`` plus ``<field>_chunk`` (index),
    ``<field>_char_count`` and ``<field>_word_count``.

    Args:
        field: The key name of the field in the Scenario to split.
        num_words: The number of words to include in each chunk. Mutually exclusive
            with num_lines.
        num_lines: The number of lines to include in each chunk. Mutually exclusive
            with num_words.
        include_original: If True, adds the complete original text to each chunk
            under ``<field>_original``.
        hash_original: If True and include_original is True, stores a hash of the
            original text instead of the full text.

    Returns:
        A ScenarioList with one Scenario per chunk.

    Raises:
        ValueError: If neither num_words nor num_lines is specified, or if both are
            (NOTE(review): enforced by DocumentChunker - confirm).
        KeyError: If the specified field doesn't exist in the Scenario
            (NOTE(review): enforced by DocumentChunker - confirm).

    Examples:
        Split by lines (1 line per chunk):
        >>> s = Scenario({"text": "This is a test.\\nThis is a test.\\n\\nThis is a test."})
        >>> s.chunk("text", num_lines=1)
        ScenarioList([Scenario({'text': 'This is a test.', 'text_chunk': 0, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': 'This is a test.', 'text_chunk': 1, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': '', 'text_chunk': 2, 'text_char_count': 0, 'text_word_count': 0}), Scenario({'text': 'This is a test.', 'text_chunk': 3, 'text_char_count': 15, 'text_word_count': 4})])

        Split by words (2 words per chunk):
        >>> s.chunk("text", num_words=2)
        ScenarioList([Scenario({'text': 'This is', 'text_chunk': 0, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 1, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 2, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 3, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 4, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 5, 'text_char_count': 7, 'text_word_count': 2})])

        Include original text in each chunk:
        >>> s = Scenario({"text": "Hello World"})
        >>> s.chunk("text", num_words=1, include_original=True)
        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'})])

        Use a hash of the original text:
        >>> s.chunk("text", num_words=1, include_original=True, hash_original=True)
        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])
    """
    from .document_chunker import DocumentChunker

    chunker = DocumentChunker(self)
    return chunker.chunk(
        field, num_words, num_lines, include_original, hash_original
    )
|
846
|
+
|
847
|
+
@classmethod
@remove_edsl_version
def from_dict(cls, d: dict) -> "Scenario":
    """
    Creates a Scenario from a dictionary, with special handling for FileStore objects.

    Values that look like serialized FileStore objects (a dict containing both
    "base64_string" and "path"), as well as values that already are FileStore
    instances, are passed through ``FileStore.from_dict``. All other values
    are used as they are. The dictionary passed in is not modified; the
    Scenario is built from a new dictionary.

    Args:
        d: A dictionary to convert to a Scenario.

    Returns:
        A new Scenario containing the provided dictionary data.

    Examples:
        >>> Scenario.from_dict({"food": "wood chips"})
        Scenario({'food': 'wood chips'})

        >>> # Example with a serialized FileStore
        >>> from edsl import FileStore
        >>> file_dict = {"path": "example.txt", "base64_string": "SGVsbG8gV29ybGQ="}
        >>> s = Scenario.from_dict({"document": file_dict})
        >>> isinstance(s["document"], FileStore)
        True

    Notes:
        - FileStore payloads are detected by the presence of the
          "base64_string" and "path" keys
        - The method is wrapped by the @remove_edsl_version decorator
    """
    from edsl.scenarios import FileStore

    def _is_file_store_payload(value) -> bool:
        """Tell whether a value should be rebuilt through FileStore.from_dict."""
        # TODO: we should check this better if its a FileStore + add remote security check against path traversal
        if isinstance(value, FileStore):
            return True
        return isinstance(value, dict) and "base64_string" in value and "path" in value

    # Build a fresh mapping instead of writing the deserialized objects back
    # into the dictionary that was passed in.
    restored = {
        key: FileStore.from_dict(value) if _is_file_store_payload(value) else value
        for key, value in d.items()
    }
    return cls(restored)
|
889
|
+
|
890
|
+
def _table(self) -> tuple[dict, list]:
|
891
|
+
"""Prepare generic table data.
|
892
|
+
>>> s = Scenario({"food": "wood chips"})
|
893
|
+
>>> s._table()
|
894
|
+
([{'Attribute': 'data', 'Value': "{'food': 'wood chips'}"}, {'Attribute': 'name', 'Value': 'None'}], ['Attribute', 'Value'])
|
895
|
+
"""
|
896
|
+
table_data = []
|
897
|
+
for attr_name, attr_value in self.__dict__.items():
|
898
|
+
table_data.append({"Attribute": attr_name, "Value": repr(attr_value)})
|
899
|
+
column_names = ["Attribute", "Value"]
|
900
|
+
return table_data, column_names
|
901
|
+
|
902
|
+
@classmethod
def example(cls, randomize: bool = False) -> Scenario:
    """
    Builds a small sample Scenario with a single "persona" entry.

    :param randomize: If True, a freshly generated UUID string is appended to
        the persona text so that each call yields a different value.
    """
    suffix = str(uuid4()) if randomize else ""
    persona = f"A reseacher studying whether LLMs can be used to generate surveys.{suffix}"
    return cls({"persona": persona})
|
915
|
+
|
916
|
+
def code(self) -> List[str]:
    """Return Python source lines that recreate this scenario.

    The first line imports ``Scenario`` from ``edsl.scenarios`` (the package
    this module itself imports it from); the second builds the scenario from
    the ``repr`` of ``self.data`` and binds it to ``s``.

    :return: The generated source, one statement per list element.
    """
    return [
        "from edsl.scenarios import Scenario",
        f"s = Scenario({self.data})",
    ]
|
923
|
+
|
924
|
+
|
925
|
+
if __name__ == "__main__":
    # Running the module directly executes its doctests; ELLIPSIS lets
    # "..." in expected output match arbitrary text.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)
|