edsl 0.1.47__py3-none-any.whl → 0.1.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +44 -39
- edsl/__version__.py +1 -1
- edsl/agents/__init__.py +4 -2
- edsl/agents/{Agent.py → agent.py} +442 -152
- edsl/agents/{AgentList.py → agent_list.py} +220 -162
- edsl/agents/descriptors.py +46 -7
- edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
- edsl/base/__init__.py +75 -0
- edsl/base/base_class.py +1303 -0
- edsl/base/data_transfer_models.py +114 -0
- edsl/base/enums.py +215 -0
- edsl/base.py +8 -0
- edsl/buckets/__init__.py +25 -0
- edsl/buckets/bucket_collection.py +324 -0
- edsl/buckets/model_buckets.py +206 -0
- edsl/buckets/token_bucket.py +502 -0
- edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
- edsl/buckets/token_bucket_client.py +509 -0
- edsl/caching/__init__.py +20 -0
- edsl/caching/cache.py +814 -0
- edsl/caching/cache_entry.py +427 -0
- edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
- edsl/caching/exceptions.py +24 -0
- edsl/caching/orm.py +30 -0
- edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
- edsl/caching/sql_dict.py +441 -0
- edsl/config/__init__.py +8 -0
- edsl/config/config_class.py +177 -0
- edsl/config.py +4 -176
- edsl/conversation/Conversation.py +7 -7
- edsl/conversation/car_buying.py +4 -4
- edsl/conversation/chips.py +6 -6
- edsl/coop/__init__.py +25 -2
- edsl/coop/coop.py +311 -75
- edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
- edsl/coop/exceptions.py +62 -0
- edsl/coop/price_fetcher.py +126 -0
- edsl/coop/utils.py +89 -24
- edsl/data_transfer_models.py +5 -72
- edsl/dataset/__init__.py +10 -0
- edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
- edsl/{results/DatasetExportMixin.py → dataset/dataset_operations_mixin.py} +606 -122
- edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
- edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
- edsl/{results → dataset/display}/table_renderers.py +58 -2
- edsl/{results → dataset}/file_exports.py +4 -5
- edsl/{results → dataset}/smart_objects.py +2 -2
- edsl/enums.py +5 -205
- edsl/inference_services/__init__.py +5 -0
- edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
- edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
- edsl/inference_services/data_structures.py +3 -2
- edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
- edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
- edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
- edsl/inference_services/registry.py +4 -41
- edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
- edsl/inference_services/services/__init__.py +31 -0
- edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
- edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
- edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
- edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
- edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
- edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
- edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
- edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
- edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
- edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
- edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +3 -7
- edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
- edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
- edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
- edsl/inference_services/write_available.py +1 -2
- edsl/instructions/__init__.py +6 -0
- edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
- edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
- edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
- edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
- edsl/interviews/__init__.py +4 -0
- edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
- edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
- edsl/interviews/interview.py +638 -0
- edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
- edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
- edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
- edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
- edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
- edsl/invigilators/__init__.py +38 -0
- edsl/invigilators/invigilator_base.py +477 -0
- edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
- edsl/invigilators/prompt_constructor.py +476 -0
- edsl/{agents → invigilators}/prompt_helpers.py +2 -1
- edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
- edsl/{agents → invigilators}/question_option_processor.py +96 -21
- edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
- edsl/jobs/__init__.py +7 -1
- edsl/jobs/async_interview_runner.py +99 -35
- edsl/jobs/check_survey_scenario_compatibility.py +7 -5
- edsl/jobs/data_structures.py +153 -22
- edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
- edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
- edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
- edsl/jobs/{Jobs.py → jobs.py} +313 -167
- edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
- edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +19 -17
- edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
- edsl/jobs/jobs_pricing_estimation.py +347 -0
- edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
- edsl/jobs/jobs_runner_asyncio.py +282 -0
- edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
- edsl/jobs/results_exceptions_handler.py +2 -2
- edsl/key_management/__init__.py +28 -0
- edsl/key_management/key_lookup.py +161 -0
- edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
- edsl/key_management/key_lookup_collection.py +82 -0
- edsl/key_management/models.py +218 -0
- edsl/language_models/__init__.py +7 -2
- edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
- edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
- edsl/language_models/language_model.py +1080 -0
- edsl/language_models/model.py +10 -25
- edsl/language_models/{ModelList.py → model_list.py} +9 -14
- edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
- edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
- edsl/language_models/repair.py +4 -4
- edsl/language_models/utilities.py +4 -4
- edsl/notebooks/__init__.py +3 -1
- edsl/notebooks/{Notebook.py → notebook.py} +7 -8
- edsl/prompts/__init__.py +1 -1
- edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
- edsl/prompts/{Prompt.py → prompt.py} +101 -95
- edsl/questions/HTMLQuestion.py +1 -1
- edsl/questions/__init__.py +154 -25
- edsl/questions/answer_validator_mixin.py +1 -1
- edsl/questions/compose_questions.py +4 -3
- edsl/questions/derived/question_likert_five.py +166 -0
- edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
- edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
- edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
- edsl/questions/descriptors.py +24 -30
- edsl/questions/loop_processor.py +65 -19
- edsl/questions/question_base.py +881 -0
- edsl/questions/question_base_gen_mixin.py +15 -16
- edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
- edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
- edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
- edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
- edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
- edsl/questions/question_free_text.py +282 -0
- edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
- edsl/questions/{QuestionList.py → question_list.py} +6 -7
- edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
- edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
- edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
- edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
- edsl/questions/question_registry.py +4 -9
- edsl/questions/register_questions_meta.py +8 -4
- edsl/questions/response_validator_abc.py +17 -16
- edsl/results/__init__.py +4 -1
- edsl/{exceptions/results.py → results/exceptions.py} +1 -1
- edsl/results/report.py +197 -0
- edsl/results/{Result.py → result.py} +131 -45
- edsl/results/{Results.py → results.py} +365 -220
- edsl/results/results_selector.py +344 -25
- edsl/scenarios/__init__.py +30 -3
- edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
- edsl/scenarios/directory_scanner.py +156 -13
- edsl/scenarios/document_chunker.py +186 -0
- edsl/scenarios/exceptions.py +101 -0
- edsl/scenarios/file_methods.py +2 -3
- edsl/scenarios/{FileStore.py → file_store.py} +275 -189
- edsl/scenarios/handlers/__init__.py +14 -14
- edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
- edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
- edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
- edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
- edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
- edsl/scenarios/handlers/latex_file_store.py +5 -0
- edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
- edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
- edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
- edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
- edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
- edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
- edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
- edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
- edsl/scenarios/scenario.py +928 -0
- edsl/scenarios/scenario_join.py +18 -5
- edsl/scenarios/{ScenarioList.py → scenario_list.py} +294 -106
- edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
- edsl/scenarios/scenario_selector.py +5 -1
- edsl/study/ObjectEntry.py +2 -2
- edsl/study/SnapShot.py +5 -5
- edsl/study/Study.py +18 -19
- edsl/study/__init__.py +6 -4
- edsl/surveys/__init__.py +7 -4
- edsl/surveys/dag/__init__.py +2 -0
- edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
- edsl/surveys/{DAG.py → dag/dag.py} +13 -10
- edsl/surveys/descriptors.py +1 -1
- edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
- edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
- edsl/surveys/memory/__init__.py +3 -0
- edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
- edsl/surveys/rules/__init__.py +3 -0
- edsl/surveys/{Rule.py → rules/rule.py} +103 -43
- edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
- edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
- edsl/surveys/survey.py +1743 -0
- edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
- edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
- edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
- edsl/tasks/__init__.py +32 -0
- edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
- edsl/tasks/task_creators.py +135 -0
- edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
- edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
- edsl/tasks/task_status_log.py +85 -0
- edsl/tokens/__init__.py +2 -0
- edsl/tokens/interview_token_usage.py +53 -0
- edsl/utilities/PrettyList.py +1 -1
- edsl/utilities/SystemInfo.py +25 -22
- edsl/utilities/__init__.py +29 -21
- edsl/utilities/gcp_bucket/__init__.py +2 -0
- edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
- edsl/utilities/interface.py +44 -536
- edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
- edsl/utilities/repair_functions.py +1 -1
- {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/METADATA +1 -1
- edsl-0.1.49.dist-info/RECORD +347 -0
- edsl/Base.py +0 -493
- edsl/BaseDiff.py +0 -260
- edsl/agents/InvigilatorBase.py +0 -260
- edsl/agents/PromptConstructor.py +0 -318
- edsl/coop/PriceFetcher.py +0 -54
- edsl/data/Cache.py +0 -582
- edsl/data/CacheEntry.py +0 -238
- edsl/data/SQLiteDict.py +0 -292
- edsl/data/__init__.py +0 -5
- edsl/data/orm.py +0 -10
- edsl/exceptions/cache.py +0 -5
- edsl/exceptions/coop.py +0 -14
- edsl/exceptions/data.py +0 -14
- edsl/exceptions/scenarios.py +0 -29
- edsl/jobs/Answers.py +0 -43
- edsl/jobs/JobsPrompts.py +0 -354
- edsl/jobs/buckets/BucketCollection.py +0 -134
- edsl/jobs/buckets/ModelBuckets.py +0 -65
- edsl/jobs/buckets/TokenBucket.py +0 -283
- edsl/jobs/buckets/TokenBucketClient.py +0 -191
- edsl/jobs/interviews/Interview.py +0 -395
- edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
- edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
- edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
- edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
- edsl/jobs/tasks/TaskCreators.py +0 -64
- edsl/jobs/tasks/TaskStatusLog.py +0 -23
- edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
- edsl/language_models/LanguageModel.py +0 -635
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/key_management/KeyLookup.py +0 -63
- edsl/language_models/key_management/KeyLookupCollection.py +0 -38
- edsl/language_models/key_management/models.py +0 -137
- edsl/questions/QuestionBase.py +0 -544
- edsl/questions/QuestionFreeText.py +0 -130
- edsl/questions/derived/QuestionLikertFive.py +0 -76
- edsl/results/ResultsExportMixin.py +0 -45
- edsl/results/TextEditor.py +0 -50
- edsl/results/results_fetch_mixin.py +0 -33
- edsl/results/results_tools_mixin.py +0 -98
- edsl/scenarios/DocumentChunker.py +0 -104
- edsl/scenarios/Scenario.py +0 -548
- edsl/scenarios/ScenarioHtmlMixin.py +0 -65
- edsl/scenarios/ScenarioListExportMixin.py +0 -45
- edsl/scenarios/handlers/latex.py +0 -5
- edsl/shared.py +0 -1
- edsl/surveys/Survey.py +0 -1301
- edsl/surveys/SurveyQualtricsImport.py +0 -284
- edsl/surveys/SurveyToApp.py +0 -141
- edsl/surveys/instructions/__init__.py +0 -0
- edsl/tools/__init__.py +0 -1
- edsl/tools/clusters.py +0 -192
- edsl/tools/embeddings.py +0 -27
- edsl/tools/embeddings_plotting.py +0 -118
- edsl/tools/plotting.py +0 -112
- edsl/tools/summarize.py +0 -18
- edsl/utilities/data/Registry.py +0 -6
- edsl/utilities/data/__init__.py +0 -1
- edsl/utilities/data/scooter_results.json +0 -1
- edsl-0.1.47.dist-info/RECORD +0 -354
- /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
- /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
- /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
- /edsl/{results → dataset/display}/table_data_class.py +0 -0
- /edsl/{results → dataset/display}/table_display.css +0 -0
- /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
- /edsl/{results → dataset}/tree_explore.py +0 -0
- /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
- /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
- /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
- /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
- /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
- /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
- /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
- /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
- /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
- /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
- /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
- /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
- /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
- /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
- /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/LICENSE +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/WHEEL +0 -0
@@ -1,15 +1,56 @@
|
|
1
|
-
|
1
|
+
"""
|
2
|
+
The DirectoryScanner module provides functionality for finding and processing files in directories.
|
3
|
+
|
4
|
+
This module implements the DirectoryScanner class, which is designed to scan directories
|
5
|
+
for files matching specific criteria and process them using a factory function. It supports
|
6
|
+
recursive scanning, filtering by file extensions, and both eager and lazy iteration over
|
7
|
+
the matching files.
|
8
|
+
"""
|
9
|
+
|
2
10
|
from dataclasses import dataclass
|
3
11
|
from typing import Optional, List, Iterator, TypeVar, Generic, Callable, Any
|
4
12
|
import os
|
13
|
+
from pathlib import Path
|
5
14
|
|
15
|
+
# Generic type variable for the factory function's return type
|
6
16
|
T = TypeVar("T")
|
7
17
|
|
8
18
|
|
9
19
|
@dataclass
|
10
20
|
class DirectoryScanner:
|
11
21
|
"""
|
12
|
-
|
22
|
+
A utility class for finding and processing files in directories.
|
23
|
+
|
24
|
+
DirectoryScanner provides methods to scan directories for files that match specific
|
25
|
+
criteria, such as file extensions. It can process matching files using a factory
|
26
|
+
function that converts file paths to objects of a specified type.
|
27
|
+
|
28
|
+
The scanner supports both eager (scan) and lazy (iter_scan) iteration, recursive
|
29
|
+
directory traversal, and flexible filtering based on file extensions.
|
30
|
+
|
31
|
+
Attributes:
|
32
|
+
directory_path: The path to the directory to scan.
|
33
|
+
|
34
|
+
Examples:
|
35
|
+
>>> import tempfile
|
36
|
+
>>> import os
|
37
|
+
>>> # Create a temporary directory with some files
|
38
|
+
>>> with tempfile.TemporaryDirectory() as tmpdir:
|
39
|
+
... # Create a few files with different extensions
|
40
|
+
... _ = open(os.path.join(tmpdir, "file1.txt"), "w").write("content")
|
41
|
+
... _ = open(os.path.join(tmpdir, "file2.txt"), "w").write("content")
|
42
|
+
... _ = open(os.path.join(tmpdir, "image.jpg"), "w").write("content")
|
43
|
+
... # Create a scanner and find all text files
|
44
|
+
... scanner = DirectoryScanner(tmpdir)
|
45
|
+
... txt_files = scanner.scan(lambda path: path, suffix_allow_list=["txt"])
|
46
|
+
... len(txt_files)
|
47
|
+
... # Use a factory to process files
|
48
|
+
... def get_filename(path):
|
49
|
+
... return os.path.basename(path)
|
50
|
+
... filenames = scanner.scan(get_filename)
|
51
|
+
... sorted(filenames)
|
52
|
+
2
|
53
|
+
['file1.txt', 'file2.txt', 'image.jpg']
|
13
54
|
"""
|
14
55
|
|
15
56
|
directory_path: str
|
@@ -24,15 +65,56 @@ class DirectoryScanner:
|
|
24
65
|
include_no_extension: bool = True,
|
25
66
|
) -> List[T]:
|
26
67
|
"""
|
27
|
-
Eagerly scan directory and return list of objects created by factory.
|
28
|
-
|
68
|
+
Eagerly scan directory and return a list of objects created by the factory function.
|
69
|
+
|
70
|
+
This method performs a scan of the directory, filtering files based on the provided
|
71
|
+
criteria, and applies the factory function to each matching file path. It returns
|
72
|
+
a complete list of processed results.
|
73
|
+
|
29
74
|
Args:
|
30
|
-
factory:
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
75
|
+
factory: A callable that takes a file path string and returns an object of type T.
|
76
|
+
This is applied to each matching file path.
|
77
|
+
recursive: If True, traverses subdirectories recursively. If False, only scans
|
78
|
+
the top-level directory.
|
79
|
+
suffix_allow_list: A list of file extensions (without dots) to include.
|
80
|
+
If provided, only files with these extensions are included.
|
81
|
+
suffix_exclude_list: A list of file extensions to exclude. This takes precedence
|
82
|
+
over suffix_allow_list.
|
83
|
+
example_suffix: If provided, only include files ending with this exact suffix.
|
84
|
+
This checks the entire filename, not just the extension.
|
85
|
+
include_no_extension: Whether to include files without extensions. Defaults to True.
|
86
|
+
|
87
|
+
Returns:
|
88
|
+
A list of objects created by applying the factory function to each matching file path.
|
89
|
+
|
90
|
+
Examples:
|
91
|
+
>>> import tempfile
|
92
|
+
>>> import os
|
93
|
+
>>> with tempfile.TemporaryDirectory() as tmpdir:
|
94
|
+
... # Create test files
|
95
|
+
... _ = open(os.path.join(tmpdir, "doc1.txt"), "w").write("content")
|
96
|
+
... _ = open(os.path.join(tmpdir, "doc2.md"), "w").write("content")
|
97
|
+
... os.mkdir(os.path.join(tmpdir, "subdir"))
|
98
|
+
... _ = open(os.path.join(tmpdir, "subdir", "doc3.txt"), "w").write("content")
|
99
|
+
... # Scan for text files only
|
100
|
+
... scanner = DirectoryScanner(tmpdir)
|
101
|
+
... paths = scanner.scan(lambda p: p, suffix_allow_list=["txt"])
|
102
|
+
... len(paths)
|
103
|
+
... # Recursive scan for all files
|
104
|
+
... all_paths = scanner.scan(lambda p: p, recursive=True)
|
105
|
+
... len(all_paths)
|
106
|
+
... # Exclude specific extensions
|
107
|
+
... no_md = scanner.scan(lambda p: p, recursive=True, suffix_exclude_list=["md"])
|
108
|
+
... len(no_md)
|
109
|
+
1
|
110
|
+
3
|
111
|
+
2
|
112
|
+
|
113
|
+
Notes:
|
114
|
+
- This method is eager and collects all results into memory. For large directories,
|
115
|
+
consider using iter_scan instead.
|
116
|
+
- The filtering logic applies filters in this order: exclude list, example suffix,
|
117
|
+
allow list, and no extension.
|
36
118
|
"""
|
37
119
|
return list(
|
38
120
|
self.iter_scan(
|
@@ -55,12 +137,64 @@ class DirectoryScanner:
|
|
55
137
|
include_no_extension: bool = True,
|
56
138
|
) -> Iterator[T]:
|
57
139
|
"""
|
58
|
-
Lazily scan directory and yield objects created by factory.
|
140
|
+
Lazily scan directory and yield objects created by the factory function.
|
141
|
+
|
142
|
+
This method performs a lazy scan of the directory, filtering files based on the provided
|
143
|
+
criteria, and applies the factory function to each matching file path. It yields
|
144
|
+
results one by one, allowing for memory-efficient processing of large directories.
|
145
|
+
|
146
|
+
Args:
|
147
|
+
factory: A callable that takes a file path string and returns an object of type T.
|
148
|
+
This is applied to each matching file path.
|
149
|
+
recursive: If True, traverses subdirectories recursively. If False, only scans
|
150
|
+
the top-level directory.
|
151
|
+
suffix_allow_list: A list of file extensions (without dots) to include.
|
152
|
+
If provided, only files with these extensions are included.
|
153
|
+
suffix_exclude_list: A list of file extensions to exclude. This takes precedence
|
154
|
+
over suffix_allow_list.
|
155
|
+
example_suffix: If provided, only include files ending with this exact suffix.
|
156
|
+
This checks the entire filename, not just the extension.
|
157
|
+
include_no_extension: Whether to include files without extensions. Defaults to True.
|
158
|
+
|
159
|
+
Yields:
|
160
|
+
Objects created by applying the factory function to each matching file path,
|
161
|
+
yielded one at a time.
|
162
|
+
|
163
|
+
Examples:
|
164
|
+
>>> import tempfile
|
165
|
+
>>> import os
|
166
|
+
>>> with tempfile.TemporaryDirectory() as tmpdir:
|
167
|
+
... # Create test files
|
168
|
+
... _ = open(os.path.join(tmpdir, "doc1.txt"), "w").write("content")
|
169
|
+
... _ = open(os.path.join(tmpdir, "doc2.md"), "w").write("content")
|
170
|
+
... # Process files lazily
|
171
|
+
... scanner = DirectoryScanner(tmpdir)
|
172
|
+
... for path in scanner.iter_scan(lambda p: p):
|
173
|
+
... # Process each file path without loading all into memory
|
174
|
+
... file_exists = os.path.exists(path)
|
175
|
+
... assert file_exists
|
176
|
+
|
177
|
+
Notes:
|
178
|
+
- This method is lazy and yields results as they are processed, making it
|
179
|
+
suitable for memory-efficient processing of large directories.
|
180
|
+
- The filtering logic is identical to the scan method.
|
59
181
|
"""
|
60
182
|
|
61
183
|
def should_include_file(filepath: str) -> bool:
|
184
|
+
"""
|
185
|
+
Determine if a file should be included based on filtering criteria.
|
186
|
+
|
187
|
+
This helper function applies all the filtering rules to determine
|
188
|
+
if a given file path should be included in the results.
|
189
|
+
|
190
|
+
Args:
|
191
|
+
filepath: The path to the file to check.
|
192
|
+
|
193
|
+
Returns:
|
194
|
+
True if the file should be included, False otherwise.
|
195
|
+
"""
|
62
196
|
_, ext = os.path.splitext(filepath)
|
63
|
-
ext = ext[1:] if ext else ""
|
197
|
+
ext = ext[1:] if ext else "" # Remove leading dot from extension
|
64
198
|
|
65
199
|
# Handle no extension case
|
66
200
|
if not ext:
|
@@ -80,7 +214,16 @@ class DirectoryScanner:
|
|
80
214
|
|
81
215
|
return True
|
82
216
|
|
83
|
-
def iter_files():
|
217
|
+
def iter_files() -> Iterator[str]:
|
218
|
+
"""
|
219
|
+
Generate paths to all files in the directory, optionally recursively.
|
220
|
+
|
221
|
+
This helper function yields file paths from the directory, handling
|
222
|
+
the recursive option appropriately.
|
223
|
+
|
224
|
+
Yields:
|
225
|
+
Paths to files in the directory.
|
226
|
+
"""
|
84
227
|
if recursive:
|
85
228
|
for root, _, files in os.walk(self.directory_path):
|
86
229
|
for file in files:
|
@@ -0,0 +1,186 @@
|
|
1
|
+
"""
|
2
|
+
The DocumentChunker module provides functionality for splitting text into manageable chunks.
|
3
|
+
|
4
|
+
This module implements the DocumentChunker class, which is responsible for chunking
|
5
|
+
text content in Scenarios based on word or line counts. This is particularly useful
|
6
|
+
when working with large text documents that need to be processed in smaller pieces,
|
7
|
+
such as for summarization, analysis, or when dealing with models that have token
|
8
|
+
limits.
|
9
|
+
"""
|
10
|
+
|
11
|
+
from __future__ import annotations
|
12
|
+
from typing import Optional, Generator, TYPE_CHECKING, List, Union
|
13
|
+
import copy
|
14
|
+
import hashlib
|
15
|
+
|
16
|
+
from .scenario import Scenario
|
17
|
+
from .scenario_list import ScenarioList
|
18
|
+
|
19
|
+
|
20
|
+
class DocumentChunker:
|
21
|
+
"""
|
22
|
+
A utility class for splitting text in a Scenario into manageable chunks.
|
23
|
+
|
24
|
+
DocumentChunker provides methods to split text content from a Scenario field into
|
25
|
+
smaller chunks based on either word count or line count. It's primarily used by the
|
26
|
+
Scenario.chunk() method but can also be used directly for more control over the
|
27
|
+
chunking process.
|
28
|
+
|
29
|
+
Attributes:
|
30
|
+
scenario: The Scenario object containing the text to be chunked.
|
31
|
+
"""
|
32
|
+
|
33
|
+
def __init__(self, scenario: "Scenario"):
|
34
|
+
"""
|
35
|
+
Initialize a DocumentChunker for a specific Scenario.
|
36
|
+
|
37
|
+
Args:
|
38
|
+
scenario: The Scenario object containing the text field to be chunked.
|
39
|
+
"""
|
40
|
+
self.scenario = scenario
|
41
|
+
|
42
|
+
@staticmethod
|
43
|
+
def _line_chunks(text: str, num_lines: int) -> Generator[str, None, None]:
|
44
|
+
"""
|
45
|
+
Split text into chunks based on a specified number of lines per chunk.
|
46
|
+
|
47
|
+
This method divides a text string into chunks, where each chunk contains
|
48
|
+
at most the specified number of lines. It processes the text by splitting
|
49
|
+
on newline characters and then groups the lines into chunks.
|
50
|
+
|
51
|
+
Args:
|
52
|
+
text: The text string to split into chunks.
|
53
|
+
num_lines: The maximum number of lines to include in each chunk.
|
54
|
+
|
55
|
+
Yields:
|
56
|
+
String chunks containing at most num_lines lines each.
|
57
|
+
|
58
|
+
Examples:
|
59
|
+
>>> list(DocumentChunker._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
|
60
|
+
['This is a test.', 'This is a test. This is a test.']
|
61
|
+
|
62
|
+
>>> list(DocumentChunker._line_chunks("Line 1\\nLine 2\\nLine 3\\nLine 4", 2))
|
63
|
+
['Line 1\\nLine 2', 'Line 3\\nLine 4']
|
64
|
+
"""
|
65
|
+
lines = text.split("\n")
|
66
|
+
for i in range(0, len(lines), num_lines):
|
67
|
+
chunk = "\n".join(lines[i : i + num_lines])
|
68
|
+
yield chunk
|
69
|
+
|
70
|
+
@staticmethod
|
71
|
+
def _word_chunks(text: str, num_words: int) -> Generator[str, None, None]:
|
72
|
+
"""
|
73
|
+
Split text into chunks based on a specified number of words per chunk.
|
74
|
+
|
75
|
+
This method divides a text string into chunks, where each chunk contains
|
76
|
+
at most the specified number of words. It processes the text by splitting
|
77
|
+
on whitespace and then groups the words into chunks.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
text: The text string to split into chunks.
|
81
|
+
num_words: The maximum number of words to include in each chunk.
|
82
|
+
|
83
|
+
Yields:
|
84
|
+
String chunks containing at most num_words words each.
|
85
|
+
|
86
|
+
Examples:
|
87
|
+
>>> list(DocumentChunker._word_chunks("This is a test.", 2))
|
88
|
+
['This is', 'a test.']
|
89
|
+
|
90
|
+
>>> list(DocumentChunker._word_chunks("One two three four five", 3))
|
91
|
+
['One two three', 'four five']
|
92
|
+
"""
|
93
|
+
words = text.split()
|
94
|
+
for i in range(0, len(words), num_words):
|
95
|
+
chunk = " ".join(words[i : i + num_words])
|
96
|
+
yield chunk
|
97
|
+
|
98
|
+
def chunk(
    self,
    field: str,
    num_words: Optional[int] = None,
    num_lines: Optional[int] = None,
    include_original: bool = False,
    hash_original: bool = False,
) -> ScenarioList:
    """
    Split a text field in the Scenario into chunks and create a ScenarioList.

    The value stored under ``field`` is divided into pieces by word count or
    by line count. For every piece a deep copy of the scenario is made, the
    field is replaced by the piece, and metadata keys are added.

    Args:
        field: The key name of the field in the Scenario to split.
        num_words: The number of words to include in each chunk. Mutually exclusive
            with num_lines. Must be at least 1.
        num_lines: The number of lines to include in each chunk. Mutually exclusive
            with num_words. Must be at least 1.
        include_original: If True, includes the original complete text in each chunk
            with a "_original" suffix.
        hash_original: If True and include_original is True, stores an MD5 hex
            digest of the original text instead of the full text.

    Returns:
        A ScenarioList containing one Scenario per chunk, each with the chunk
        text and metadata about the chunk.

    Raises:
        KeyError: If the specified field doesn't exist in the Scenario.
        ValueError: If neither num_words nor num_lines is specified, if both
            are, or if the one given is smaller than 1.

    Notes:
        - Each chunk is assigned a sequential index in the '{field}_chunk' field
        - Character and word counts for each chunk are included in '{field}_char_count'
          and '{field}_word_count' fields
        - When include_original is True, the original text (or its hash) is stored
          in each chunk in the '{field}_original' field
    """
    # Check if field exists in the scenario
    if field not in self.scenario:
        raise KeyError(f"Field '{field}' not found in the scenario")

    # Validate parameters
    if num_words is None and num_lines is None:
        raise ValueError("You must specify either num_words or num_lines.")

    if num_words is not None and num_lines is not None:
        raise ValueError(
            "You must specify either num_words or num_lines, but not both."
        )

    # A zero step makes range() fail with an unrelated message and a negative
    # one silently produces no chunks, so reject both up front.
    chunk_size = num_words if num_words is not None else num_lines
    if chunk_size < 1:
        raise ValueError("num_words and num_lines must be positive integers.")

    original_text = self.scenario[field]

    # Get appropriate chunks based on the specified chunking method
    if num_words is not None:
        pieces = self._word_chunks(original_text, num_words)
    else:  # num_lines is not None
        pieces = self._line_chunks(original_text, num_lines)

    # The value stored under '{field}_original' is the same for every chunk,
    # so compute it (in particular the hash) once instead of per iteration.
    original_value = None
    if include_original:
        if hash_original:
            # Use MD5 hash for brevity, not for cryptographic security
            original_value = hashlib.md5(original_text.encode()).hexdigest()
        else:
            original_value = original_text

    # Create a new scenario for each chunk with metadata
    scenarios = []
    for index, piece in enumerate(pieces):
        new_scenario = copy.deepcopy(self.scenario)
        new_scenario[field] = piece
        new_scenario[field + "_chunk"] = index
        new_scenario[field + "_char_count"] = len(piece)
        new_scenario[field + "_word_count"] = len(piece.split())

        if include_original:
            new_scenario[field + "_original"] = original_value

        scenarios.append(new_scenario)

    return ScenarioList(scenarios)
|
181
|
+
|
182
|
+
|
183
|
+
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
|
@@ -0,0 +1,101 @@
|
|
1
|
+
"""
|
2
|
+
Exceptions module for the scenarios package.
|
3
|
+
|
4
|
+
This module defines custom exception classes used throughout the scenarios module.
|
5
|
+
These exceptions provide specific error information for different types of errors
|
6
|
+
that can occur when working with Scenarios, ScenarioLists, and related components.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import re
|
10
|
+
from typing import List
|
11
|
+
|
12
|
+
from ..base import BaseException
|
13
|
+
|
14
|
+
|
15
|
+
class AgentListError(BaseException):
    """
    Error type for failures in AgentList operations.

    Raised when an AgentList cannot be created, modified, or used together
    with scenarios.

    Args:
        message: A description of the error that occurred.
    """

    def __init__(self, message: str):
        """
        Store ``message`` on the exception by delegating to the base class.

        Args:
            message: A description of the error that occurred.
        """
        super(AgentListError, self).__init__(message)
|
34
|
+
|
35
|
+
|
36
|
+
class ScenarioError(BaseException):
    """
    Exception raised for errors related to Scenario operations.

    This exception is raised when there are issues with creating, modifying,
    or using Scenarios. It appends a link to the documentation to the error
    message, and its string form wraps every URL in terminal hyperlink
    escape sequences.

    Args:
        message: A description of the error that occurred.
    """

    documentation = "https://docs.expectedparrot.com/en/latest/scenarios.html#module-edsl.scenarios.Scenario"

    def __init__(self, message: str):
        """
        Initialize the ScenarioError with a message and add documentation link.

        Args:
            message: A description of the error that occurred.
        """
        self.message = message + "\n" + "Documentation: " + self.documentation
        super().__init__(self.message)

    def __str__(self) -> str:
        """
        Return the stored message with its URLs made clickable.

        Returns:
            The error message with every URL wrapped by make_urls_clickable.
        """
        return self.make_urls_clickable(self.message)

    @staticmethod
    def make_urls_clickable(text: str) -> str:
        """
        Convert URLs in text to clickable links in terminal output.

        Every ``http://`` or ``https://`` URL (a run of non-whitespace
        characters) is wrapped as ``ESC ] 8 ; ; URL BEL URL ESC ] 8 ; ; BEL``.
        The substitution is done in a single pass over the input, so a URL
        that occurs several times, or that is a prefix of another URL in the
        same text, is wrapped exactly once per occurrence.

        Args:
            text: The text containing URLs to make clickable.

        Returns:
            The text with URLs converted to clickable links.

        Example:
            >>> ScenarioError.make_urls_clickable("no links here")
            'no links here'
        """
        url_pattern = r"https?://[^\s]+"

        def _wrap(match):
            # A callable replacement keeps the URL text literal; a template
            # string would interpret backslashes and group references.
            url = match.group(0)
            return f"\033]8;;{url}\007{url}\033]8;;\007"

        return re.sub(url_pattern, _wrap, text)
|
97
|
+
|
98
|
+
|
99
|
+
if __name__ == "__main__":
    # Executed as a script: run this module's doctests with ELLIPSIS matching on.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)
|
edsl/scenarios/file_methods.py
CHANGED
@@ -2,8 +2,7 @@ from typing import Optional, Dict, Type
|
|
2
2
|
from abc import ABC, abstractmethod
|
3
3
|
import importlib.metadata
|
4
4
|
import importlib.util
|
5
|
-
|
6
|
-
from edsl.utilities.is_notebook import is_notebook
|
5
|
+
from ..utilities import is_notebook
|
7
6
|
|
8
7
|
|
9
8
|
class FileMethods(ABC):
|
@@ -30,7 +29,7 @@ class FileMethods(ABC):
|
|
30
29
|
def load_plugins(cls):
|
31
30
|
"""Load all file handler plugins including built-ins and external plugins."""
|
32
31
|
|
33
|
-
from
|
32
|
+
from . import handlers # noqa: F401 - import needed for handler registration
|
34
33
|
|
35
34
|
# Then load any external plugins
|
36
35
|
try:
|