edsl 0.1.47__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314) hide show
  1. edsl/__init__.py +44 -39
  2. edsl/__version__.py +1 -1
  3. edsl/agents/__init__.py +4 -2
  4. edsl/agents/{Agent.py → agent.py} +442 -152
  5. edsl/agents/{AgentList.py → agent_list.py} +220 -162
  6. edsl/agents/descriptors.py +46 -7
  7. edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
  8. edsl/base/__init__.py +75 -0
  9. edsl/base/base_class.py +1303 -0
  10. edsl/base/data_transfer_models.py +114 -0
  11. edsl/base/enums.py +215 -0
  12. edsl/base.py +8 -0
  13. edsl/buckets/__init__.py +25 -0
  14. edsl/buckets/bucket_collection.py +324 -0
  15. edsl/buckets/model_buckets.py +206 -0
  16. edsl/buckets/token_bucket.py +502 -0
  17. edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
  18. edsl/buckets/token_bucket_client.py +509 -0
  19. edsl/caching/__init__.py +20 -0
  20. edsl/caching/cache.py +814 -0
  21. edsl/caching/cache_entry.py +427 -0
  22. edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
  23. edsl/caching/exceptions.py +24 -0
  24. edsl/caching/orm.py +30 -0
  25. edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
  26. edsl/caching/sql_dict.py +441 -0
  27. edsl/config/__init__.py +8 -0
  28. edsl/config/config_class.py +177 -0
  29. edsl/config.py +4 -176
  30. edsl/conversation/Conversation.py +7 -7
  31. edsl/conversation/car_buying.py +4 -4
  32. edsl/conversation/chips.py +6 -6
  33. edsl/coop/__init__.py +25 -2
  34. edsl/coop/coop.py +303 -67
  35. edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
  36. edsl/coop/exceptions.py +62 -0
  37. edsl/coop/price_fetcher.py +126 -0
  38. edsl/coop/utils.py +89 -24
  39. edsl/data_transfer_models.py +5 -72
  40. edsl/dataset/__init__.py +10 -0
  41. edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
  42. edsl/{results/DatasetExportMixin.py → dataset/dataset_operations_mixin.py} +606 -122
  43. edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
  44. edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
  45. edsl/{results → dataset/display}/table_renderers.py +58 -2
  46. edsl/{results → dataset}/file_exports.py +4 -5
  47. edsl/{results → dataset}/smart_objects.py +2 -2
  48. edsl/enums.py +5 -205
  49. edsl/inference_services/__init__.py +5 -0
  50. edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
  51. edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
  52. edsl/inference_services/data_structures.py +3 -2
  53. edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
  54. edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
  55. edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
  56. edsl/inference_services/registry.py +4 -41
  57. edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
  58. edsl/inference_services/services/__init__.py +31 -0
  59. edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
  60. edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
  61. edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
  62. edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
  63. edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
  64. edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
  65. edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
  66. edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
  67. edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
  68. edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
  69. edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +3 -7
  70. edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
  71. edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
  72. edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
  73. edsl/inference_services/write_available.py +1 -2
  74. edsl/instructions/__init__.py +6 -0
  75. edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
  76. edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
  77. edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
  78. edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
  79. edsl/interviews/__init__.py +4 -0
  80. edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
  81. edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
  82. edsl/interviews/interview.py +638 -0
  83. edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
  84. edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
  85. edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
  86. edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
  87. edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
  88. edsl/invigilators/__init__.py +38 -0
  89. edsl/invigilators/invigilator_base.py +477 -0
  90. edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
  91. edsl/invigilators/prompt_constructor.py +476 -0
  92. edsl/{agents → invigilators}/prompt_helpers.py +2 -1
  93. edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
  94. edsl/{agents → invigilators}/question_option_processor.py +96 -21
  95. edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
  96. edsl/jobs/__init__.py +7 -1
  97. edsl/jobs/async_interview_runner.py +99 -35
  98. edsl/jobs/check_survey_scenario_compatibility.py +7 -5
  99. edsl/jobs/data_structures.py +153 -22
  100. edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
  101. edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
  102. edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
  103. edsl/jobs/{Jobs.py → jobs.py} +313 -167
  104. edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
  105. edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +19 -17
  106. edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
  107. edsl/jobs/jobs_pricing_estimation.py +347 -0
  108. edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
  109. edsl/jobs/jobs_runner_asyncio.py +282 -0
  110. edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
  111. edsl/jobs/results_exceptions_handler.py +2 -2
  112. edsl/key_management/__init__.py +28 -0
  113. edsl/key_management/key_lookup.py +161 -0
  114. edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
  115. edsl/key_management/key_lookup_collection.py +82 -0
  116. edsl/key_management/models.py +218 -0
  117. edsl/language_models/__init__.py +7 -2
  118. edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
  119. edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
  120. edsl/language_models/language_model.py +1080 -0
  121. edsl/language_models/model.py +10 -25
  122. edsl/language_models/{ModelList.py → model_list.py} +9 -14
  123. edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
  124. edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
  125. edsl/language_models/repair.py +4 -4
  126. edsl/language_models/utilities.py +4 -4
  127. edsl/notebooks/__init__.py +3 -1
  128. edsl/notebooks/{Notebook.py → notebook.py} +7 -8
  129. edsl/prompts/__init__.py +1 -1
  130. edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
  131. edsl/prompts/{Prompt.py → prompt.py} +101 -95
  132. edsl/questions/HTMLQuestion.py +1 -1
  133. edsl/questions/__init__.py +154 -25
  134. edsl/questions/answer_validator_mixin.py +1 -1
  135. edsl/questions/compose_questions.py +4 -3
  136. edsl/questions/derived/question_likert_five.py +166 -0
  137. edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
  138. edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
  139. edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
  140. edsl/questions/descriptors.py +24 -30
  141. edsl/questions/loop_processor.py +65 -19
  142. edsl/questions/question_base.py +881 -0
  143. edsl/questions/question_base_gen_mixin.py +15 -16
  144. edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
  145. edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
  146. edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
  147. edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
  148. edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
  149. edsl/questions/question_free_text.py +282 -0
  150. edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
  151. edsl/questions/{QuestionList.py → question_list.py} +6 -7
  152. edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
  153. edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
  154. edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
  155. edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
  156. edsl/questions/question_registry.py +4 -9
  157. edsl/questions/register_questions_meta.py +8 -4
  158. edsl/questions/response_validator_abc.py +17 -16
  159. edsl/results/__init__.py +4 -1
  160. edsl/{exceptions/results.py → results/exceptions.py} +1 -1
  161. edsl/results/report.py +197 -0
  162. edsl/results/{Result.py → result.py} +131 -45
  163. edsl/results/{Results.py → results.py} +365 -220
  164. edsl/results/results_selector.py +344 -25
  165. edsl/scenarios/__init__.py +30 -3
  166. edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
  167. edsl/scenarios/directory_scanner.py +156 -13
  168. edsl/scenarios/document_chunker.py +186 -0
  169. edsl/scenarios/exceptions.py +101 -0
  170. edsl/scenarios/file_methods.py +2 -3
  171. edsl/scenarios/{FileStore.py → file_store.py} +275 -189
  172. edsl/scenarios/handlers/__init__.py +14 -14
  173. edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
  174. edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
  175. edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
  176. edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
  177. edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
  178. edsl/scenarios/handlers/latex_file_store.py +5 -0
  179. edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
  180. edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
  181. edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
  182. edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
  183. edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
  184. edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
  185. edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
  186. edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
  187. edsl/scenarios/scenario.py +928 -0
  188. edsl/scenarios/scenario_join.py +18 -5
  189. edsl/scenarios/{ScenarioList.py → scenario_list.py} +294 -106
  190. edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
  191. edsl/scenarios/scenario_selector.py +5 -1
  192. edsl/study/ObjectEntry.py +2 -2
  193. edsl/study/SnapShot.py +5 -5
  194. edsl/study/Study.py +18 -19
  195. edsl/study/__init__.py +6 -4
  196. edsl/surveys/__init__.py +7 -4
  197. edsl/surveys/dag/__init__.py +2 -0
  198. edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
  199. edsl/surveys/{DAG.py → dag/dag.py} +13 -10
  200. edsl/surveys/descriptors.py +1 -1
  201. edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
  202. edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
  203. edsl/surveys/memory/__init__.py +3 -0
  204. edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
  205. edsl/surveys/rules/__init__.py +3 -0
  206. edsl/surveys/{Rule.py → rules/rule.py} +103 -43
  207. edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
  208. edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
  209. edsl/surveys/survey.py +1743 -0
  210. edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
  211. edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
  212. edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
  213. edsl/tasks/__init__.py +32 -0
  214. edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
  215. edsl/tasks/task_creators.py +135 -0
  216. edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
  217. edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
  218. edsl/tasks/task_status_log.py +85 -0
  219. edsl/tokens/__init__.py +2 -0
  220. edsl/tokens/interview_token_usage.py +53 -0
  221. edsl/utilities/PrettyList.py +1 -1
  222. edsl/utilities/SystemInfo.py +25 -22
  223. edsl/utilities/__init__.py +29 -21
  224. edsl/utilities/gcp_bucket/__init__.py +2 -0
  225. edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
  226. edsl/utilities/interface.py +44 -536
  227. edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
  228. edsl/utilities/repair_functions.py +1 -1
  229. {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/METADATA +1 -1
  230. edsl-0.1.48.dist-info/RECORD +347 -0
  231. edsl/Base.py +0 -493
  232. edsl/BaseDiff.py +0 -260
  233. edsl/agents/InvigilatorBase.py +0 -260
  234. edsl/agents/PromptConstructor.py +0 -318
  235. edsl/coop/PriceFetcher.py +0 -54
  236. edsl/data/Cache.py +0 -582
  237. edsl/data/CacheEntry.py +0 -238
  238. edsl/data/SQLiteDict.py +0 -292
  239. edsl/data/__init__.py +0 -5
  240. edsl/data/orm.py +0 -10
  241. edsl/exceptions/cache.py +0 -5
  242. edsl/exceptions/coop.py +0 -14
  243. edsl/exceptions/data.py +0 -14
  244. edsl/exceptions/scenarios.py +0 -29
  245. edsl/jobs/Answers.py +0 -43
  246. edsl/jobs/JobsPrompts.py +0 -354
  247. edsl/jobs/buckets/BucketCollection.py +0 -134
  248. edsl/jobs/buckets/ModelBuckets.py +0 -65
  249. edsl/jobs/buckets/TokenBucket.py +0 -283
  250. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  251. edsl/jobs/interviews/Interview.py +0 -395
  252. edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
  253. edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
  254. edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
  255. edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
  256. edsl/jobs/tasks/TaskCreators.py +0 -64
  257. edsl/jobs/tasks/TaskStatusLog.py +0 -23
  258. edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
  259. edsl/language_models/LanguageModel.py +0 -635
  260. edsl/language_models/ServiceDataSources.py +0 -0
  261. edsl/language_models/key_management/KeyLookup.py +0 -63
  262. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  263. edsl/language_models/key_management/models.py +0 -137
  264. edsl/questions/QuestionBase.py +0 -544
  265. edsl/questions/QuestionFreeText.py +0 -130
  266. edsl/questions/derived/QuestionLikertFive.py +0 -76
  267. edsl/results/ResultsExportMixin.py +0 -45
  268. edsl/results/TextEditor.py +0 -50
  269. edsl/results/results_fetch_mixin.py +0 -33
  270. edsl/results/results_tools_mixin.py +0 -98
  271. edsl/scenarios/DocumentChunker.py +0 -104
  272. edsl/scenarios/Scenario.py +0 -548
  273. edsl/scenarios/ScenarioHtmlMixin.py +0 -65
  274. edsl/scenarios/ScenarioListExportMixin.py +0 -45
  275. edsl/scenarios/handlers/latex.py +0 -5
  276. edsl/shared.py +0 -1
  277. edsl/surveys/Survey.py +0 -1301
  278. edsl/surveys/SurveyQualtricsImport.py +0 -284
  279. edsl/surveys/SurveyToApp.py +0 -141
  280. edsl/surveys/instructions/__init__.py +0 -0
  281. edsl/tools/__init__.py +0 -1
  282. edsl/tools/clusters.py +0 -192
  283. edsl/tools/embeddings.py +0 -27
  284. edsl/tools/embeddings_plotting.py +0 -118
  285. edsl/tools/plotting.py +0 -112
  286. edsl/tools/summarize.py +0 -18
  287. edsl/utilities/data/Registry.py +0 -6
  288. edsl/utilities/data/__init__.py +0 -1
  289. edsl/utilities/data/scooter_results.json +0 -1
  290. edsl-0.1.47.dist-info/RECORD +0 -354
  291. /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
  292. /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
  293. /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
  294. /edsl/{results → dataset/display}/table_data_class.py +0 -0
  295. /edsl/{results → dataset/display}/table_display.css +0 -0
  296. /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
  297. /edsl/{results → dataset}/tree_explore.py +0 -0
  298. /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
  299. /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
  300. /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
  301. /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
  302. /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
  303. /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
  304. /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
  305. /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
  306. /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
  307. /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
  308. /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
  309. /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
  310. /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
  311. /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
  312. /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
  313. {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
  314. {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
@@ -1,15 +1,56 @@
1
- # directory_scanner.py
1
+ """
2
+ The DirectoryScanner module provides functionality for finding and processing files in directories.
3
+
4
+ This module implements the DirectoryScanner class, which is designed to scan directories
5
+ for files matching specific criteria and process them using a factory function. It supports
6
+ recursive scanning, filtering by file extensions, and both eager and lazy iteration over
7
+ the matching files.
8
+ """
9
+
2
10
  from dataclasses import dataclass
3
11
  from typing import Optional, List, Iterator, TypeVar, Generic, Callable, Any
4
12
  import os
13
+ from pathlib import Path
5
14
 
15
+ # Generic type variable for the factory function's return type
6
16
  T = TypeVar("T")
7
17
 
8
18
 
9
19
  @dataclass
10
20
  class DirectoryScanner:
11
21
  """
12
- Scanner for finding files in a directory based on various criteria.
22
+ A utility class for finding and processing files in directories.
23
+
24
+ DirectoryScanner provides methods to scan directories for files that match specific
25
+ criteria, such as file extensions. It can process matching files using a factory
26
+ function that converts file paths to objects of a specified type.
27
+
28
+ The scanner supports both eager (scan) and lazy (iter_scan) iteration, recursive
29
+ directory traversal, and flexible filtering based on file extensions.
30
+
31
+ Attributes:
32
+ directory_path: The path to the directory to scan.
33
+
34
+ Examples:
35
+ >>> import tempfile
36
+ >>> import os
37
+ >>> # Create a temporary directory with some files
38
+ >>> with tempfile.TemporaryDirectory() as tmpdir:
39
+ ... # Create a few files with different extensions
40
+ ... _ = open(os.path.join(tmpdir, "file1.txt"), "w").write("content")
41
+ ... _ = open(os.path.join(tmpdir, "file2.txt"), "w").write("content")
42
+ ... _ = open(os.path.join(tmpdir, "image.jpg"), "w").write("content")
43
+ ... # Create a scanner and find all text files
44
+ ... scanner = DirectoryScanner(tmpdir)
45
+ ... txt_files = scanner.scan(lambda path: path, suffix_allow_list=["txt"])
46
+ ... len(txt_files)
47
+ ... # Use a factory to process files
48
+ ... def get_filename(path):
49
+ ... return os.path.basename(path)
50
+ ... filenames = scanner.scan(get_filename)
51
+ ... sorted(filenames)
52
+ 2
53
+ ['file1.txt', 'file2.txt', 'image.jpg']
13
54
  """
14
55
 
15
56
  directory_path: str
@@ -24,15 +65,56 @@ class DirectoryScanner:
24
65
  include_no_extension: bool = True,
25
66
  ) -> List[T]:
26
67
  """
27
- Eagerly scan directory and return list of objects created by factory.
28
-
68
+ Eagerly scan directory and return a list of objects created by the factory function.
69
+
70
+ This method performs a scan of the directory, filtering files based on the provided
71
+ criteria, and applies the factory function to each matching file path. It returns
72
+ a complete list of processed results.
73
+
29
74
  Args:
30
- factory: Callable that creates objects from file paths
31
- recursive: If True, recursively traverse subdirectories
32
- suffix_allow_list: List of allowed file extensions (without dots)
33
- suffix_exclude_list: List of excluded file extensions (takes precedence over allow list)
34
- example_suffix: If provided, only include files with this example suffix
35
- include_no_extension: Whether to include files without extensions
75
+ factory: A callable that takes a file path string and returns an object of type T.
76
+ This is applied to each matching file path.
77
+ recursive: If True, traverses subdirectories recursively. If False, only scans
78
+ the top-level directory.
79
+ suffix_allow_list: A list of file extensions (without dots) to include.
80
+ If provided, only files with these extensions are included.
81
+ suffix_exclude_list: A list of file extensions to exclude. This takes precedence
82
+ over suffix_allow_list.
83
+ example_suffix: If provided, only include files ending with this exact suffix.
84
+ This checks the entire filename, not just the extension.
85
+ include_no_extension: Whether to include files without extensions. Defaults to True.
86
+
87
+ Returns:
88
+ A list of objects created by applying the factory function to each matching file path.
89
+
90
+ Examples:
91
+ >>> import tempfile
92
+ >>> import os
93
+ >>> with tempfile.TemporaryDirectory() as tmpdir:
94
+ ... # Create test files
95
+ ... _ = open(os.path.join(tmpdir, "doc1.txt"), "w").write("content")
96
+ ... _ = open(os.path.join(tmpdir, "doc2.md"), "w").write("content")
97
+ ... os.mkdir(os.path.join(tmpdir, "subdir"))
98
+ ... _ = open(os.path.join(tmpdir, "subdir", "doc3.txt"), "w").write("content")
99
+ ... # Scan for text files only
100
+ ... scanner = DirectoryScanner(tmpdir)
101
+ ... paths = scanner.scan(lambda p: p, suffix_allow_list=["txt"])
102
+ ... len(paths)
103
+ ... # Recursive scan for all files
104
+ ... all_paths = scanner.scan(lambda p: p, recursive=True)
105
+ ... len(all_paths)
106
+ ... # Exclude specific extensions
107
+ ... no_md = scanner.scan(lambda p: p, recursive=True, suffix_exclude_list=["md"])
108
+ ... len(no_md)
109
+ 1
110
+ 3
111
+ 2
112
+
113
+ Notes:
114
+ - This method is eager and collects all results into memory. For large directories,
115
+ consider using iter_scan instead.
116
+ - The filtering logic checks for a missing extension first, then applies the
117
+ exclude list, example suffix, and allow list.
36
118
  """
37
119
  return list(
38
120
  self.iter_scan(
@@ -55,12 +137,64 @@ class DirectoryScanner:
55
137
  include_no_extension: bool = True,
56
138
  ) -> Iterator[T]:
57
139
  """
58
- Lazily scan directory and yield objects created by factory.
140
+ Lazily scan directory and yield objects created by the factory function.
141
+
142
+ This method performs a lazy scan of the directory, filtering files based on the provided
143
+ criteria, and applies the factory function to each matching file path. It yields
144
+ results one by one, allowing for memory-efficient processing of large directories.
145
+
146
+ Args:
147
+ factory: A callable that takes a file path string and returns an object of type T.
148
+ This is applied to each matching file path.
149
+ recursive: If True, traverses subdirectories recursively. If False, only scans
150
+ the top-level directory.
151
+ suffix_allow_list: A list of file extensions (without dots) to include.
152
+ If provided, only files with these extensions are included.
153
+ suffix_exclude_list: A list of file extensions to exclude. This takes precedence
154
+ over suffix_allow_list.
155
+ example_suffix: If provided, only include files ending with this exact suffix.
156
+ This checks the entire filename, not just the extension.
157
+ include_no_extension: Whether to include files without extensions. Defaults to True.
158
+
159
+ Yields:
160
+ Objects created by applying the factory function to each matching file path,
161
+ yielded one at a time.
162
+
163
+ Examples:
164
+ >>> import tempfile
165
+ >>> import os
166
+ >>> with tempfile.TemporaryDirectory() as tmpdir:
167
+ ... # Create test files
168
+ ... _ = open(os.path.join(tmpdir, "doc1.txt"), "w").write("content")
169
+ ... _ = open(os.path.join(tmpdir, "doc2.md"), "w").write("content")
170
+ ... # Process files lazily
171
+ ... scanner = DirectoryScanner(tmpdir)
172
+ ... for path in scanner.iter_scan(lambda p: p):
173
+ ... # Process each file path without loading all into memory
174
+ ... file_exists = os.path.exists(path)
175
+ ... assert file_exists
176
+
177
+ Notes:
178
+ - This method is lazy and yields results as they are processed, making it
179
+ suitable for memory-efficient processing of large directories.
180
+ - The filtering logic is identical to the scan method.
59
181
  """
60
182
 
61
183
  def should_include_file(filepath: str) -> bool:
184
+ """
185
+ Determine if a file should be included based on filtering criteria.
186
+
187
+ This helper function applies all the filtering rules to determine
188
+ if a given file path should be included in the results.
189
+
190
+ Args:
191
+ filepath: The path to the file to check.
192
+
193
+ Returns:
194
+ True if the file should be included, False otherwise.
195
+ """
62
196
  _, ext = os.path.splitext(filepath)
63
- ext = ext[1:] if ext else ""
197
+ ext = ext[1:] if ext else "" # Remove leading dot from extension
64
198
 
65
199
  # Handle no extension case
66
200
  if not ext:
@@ -80,7 +214,16 @@ class DirectoryScanner:
80
214
 
81
215
  return True
82
216
 
83
- def iter_files():
217
+ def iter_files() -> Iterator[str]:
218
+ """
219
+ Generate paths to all files in the directory, optionally recursively.
220
+
221
+ This helper function yields file paths from the directory, handling
222
+ the recursive option appropriately.
223
+
224
+ Yields:
225
+ Paths to files in the directory.
226
+ """
84
227
  if recursive:
85
228
  for root, _, files in os.walk(self.directory_path):
86
229
  for file in files:
@@ -0,0 +1,186 @@
1
+ """
2
+ The DocumentChunker module provides functionality for splitting text into manageable chunks.
3
+
4
+ This module implements the DocumentChunker class, which is responsible for chunking
5
+ text content in Scenarios based on word or line counts. This is particularly useful
6
+ when working with large text documents that need to be processed in smaller pieces,
7
+ such as for summarization, analysis, or when dealing with models that have token
8
+ limits.
9
+ """
10
+
11
+ from __future__ import annotations
12
+ from typing import Optional, Generator, TYPE_CHECKING, List, Union
13
+ import copy
14
+ import hashlib
15
+
16
+ from .scenario import Scenario
17
+ from .scenario_list import ScenarioList
18
+
19
+
20
class DocumentChunker:
    """
    Split the text stored in one field of a Scenario into smaller chunks.

    Chunks are cut either every ``num_words`` whitespace-separated words or
    every ``num_lines`` newline-separated lines. The ``chunk`` method returns
    one new Scenario per chunk, each carrying the chunk text plus metadata
    about it.

    Attributes:
        scenario: The Scenario object containing the text to be chunked.
    """

    def __init__(self, scenario: "Scenario"):
        """
        Initialize a DocumentChunker for a specific Scenario.

        Args:
            scenario: The Scenario object containing the text field to be chunked.
        """
        self.scenario = scenario

    @staticmethod
    def _require_positive(name: str, value: int) -> None:
        """
        Reject chunk sizes smaller than 1.

        Without this check a size of 0 fails inside ``range`` with an
        unrelated message, and a negative size silently produces no chunks.

        Args:
            name: The parameter name to report in the error message.
            value: The requested chunk size.

        Raises:
            ValueError: If ``value`` is less than 1.
        """
        if value < 1:
            raise ValueError(f"{name} must be a positive integer, got {value!r}.")

    @staticmethod
    def _line_chunks(text: str, num_lines: int) -> Generator[str, None, None]:
        """
        Split text into chunks of at most ``num_lines`` lines each.

        The text is split on newline characters and consecutive lines are
        re-joined with newlines, so the last chunk may be shorter.

        Args:
            text: The text string to split into chunks.
            num_lines: The maximum number of lines to include in each chunk.

        Yields:
            String chunks containing at most num_lines lines each.

        Raises:
            ValueError: If num_lines is less than 1 (raised when iteration starts).

        Examples:
            >>> list(DocumentChunker._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
            ['This is a test.', 'This is a test. This is a test.']

            >>> list(DocumentChunker._line_chunks("Line 1\\nLine 2\\nLine 3\\nLine 4", 2))
            ['Line 1\\nLine 2', 'Line 3\\nLine 4']
        """
        DocumentChunker._require_positive("num_lines", num_lines)
        lines = text.split("\n")
        for start in range(0, len(lines), num_lines):
            yield "\n".join(lines[start : start + num_lines])

    @staticmethod
    def _word_chunks(text: str, num_words: int) -> Generator[str, None, None]:
        """
        Split text into chunks of at most ``num_words`` words each.

        The text is split on whitespace and consecutive words are re-joined
        with single spaces, so original spacing and line breaks are not kept.

        Args:
            text: The text string to split into chunks.
            num_words: The maximum number of words to include in each chunk.

        Yields:
            String chunks containing at most num_words words each.

        Raises:
            ValueError: If num_words is less than 1 (raised when iteration starts).

        Examples:
            >>> list(DocumentChunker._word_chunks("This is a test.", 2))
            ['This is', 'a test.']

            >>> list(DocumentChunker._word_chunks("One two three four five", 3))
            ['One two three', 'four five']
        """
        DocumentChunker._require_positive("num_words", num_words)
        words = text.split()
        for start in range(0, len(words), num_words):
            yield " ".join(words[start : start + num_words])

    def chunk(
        self,
        field: str,
        num_words: Optional[int] = None,
        num_lines: Optional[int] = None,
        include_original: bool = False,
        hash_original: bool = False,
    ) -> "ScenarioList":
        """
        Split a text field in the Scenario into chunks and create a ScenarioList.

        Each chunk becomes a deep copy of the Scenario in which ``field`` holds
        the chunk text and extra keys describe the chunk.

        Args:
            field: The key name of the field in the Scenario to split.
            num_words: The number of words to include in each chunk. Mutually exclusive
                with num_lines.
            num_lines: The number of lines to include in each chunk. Mutually exclusive
                with num_words.
            include_original: If True, stores the original complete text in each chunk
                under the '{field}_original' key.
            hash_original: If True and include_original is True, stores an MD5 hex
                digest of the original text instead of the full text.

        Returns:
            A ScenarioList with one Scenario per chunk, in chunk order.

        Raises:
            KeyError: If the specified field doesn't exist in the Scenario.
            ValueError: If neither or both of num_words and num_lines are given,
                or if the given chunk size is less than 1.

        Notes:
            - '{field}_chunk' holds the zero-based index of the chunk
            - '{field}_char_count' and '{field}_word_count' describe the chunk text
            - '{field}_original' is only added when include_original is True
        """
        # Check if field exists in the scenario
        if field not in self.scenario:
            raise KeyError(f"Field '{field}' not found in the scenario")

        # Exactly one chunking mode must be selected
        if num_words is None and num_lines is None:
            raise ValueError("You must specify either num_words or num_lines.")

        if num_words is not None and num_lines is not None:
            raise ValueError(
                "You must specify either num_words or num_lines, but not both."
            )

        text = self.scenario[field]

        # Validate the size eagerly so a bad value fails here with a clear
        # message rather than inside range() or as a silently empty result.
        if num_words is not None:
            self._require_positive("num_words", num_words)
            chunks = list(self._word_chunks(text, num_words))
        else:  # num_lines is not None
            self._require_positive("num_lines", num_lines)
            chunks = list(self._line_chunks(text, num_lines))

        # The stored original (or its digest) is the same for every chunk,
        # so compute it once instead of re-hashing the full text per chunk.
        original_value = None
        if include_original:
            if hash_original:
                # MD5 is used for brevity, not for cryptographic security
                original_value = hashlib.md5(text.encode()).hexdigest()
            else:
                original_value = text

        # Create a new scenario for each chunk with metadata
        scenarios = []
        for index, piece in enumerate(chunks):
            new_scenario = copy.deepcopy(self.scenario)
            new_scenario[field] = piece
            new_scenario[field + "_chunk"] = index
            new_scenario[field + "_char_count"] = len(piece)
            new_scenario[field + "_word_count"] = len(piece.split())

            if include_original:
                new_scenario[field + "_original"] = original_value

            scenarios.append(new_scenario)

        return ScenarioList(scenarios)
181
+
182
+
183
+ if __name__ == "__main__":
184
+ import doctest
185
+
186
+ doctest.testmod()
@@ -0,0 +1,101 @@
1
+ """
2
+ Exceptions module for the scenarios package.
3
+
4
+ This module defines custom exception classes used throughout the scenarios module.
5
+ These exceptions provide specific error information for different types of errors
6
+ that can occur when working with Scenarios, ScenarioLists, and related components.
7
+ """
8
+
9
+ import re
10
+ from typing import List
11
+
12
+ from ..base import BaseException
13
+
14
+
15
class AgentListError(BaseException):
    """
    Signal a failure in an AgentList operation.

    Raised when an AgentList cannot be created, modified, or used
    together with scenarios.

    Args:
        message: Human-readable explanation of what went wrong.
    """

    def __init__(self, message: str):
        """
        Build the error from its message.

        Args:
            message: Human-readable explanation of what went wrong; it is
                forwarded unchanged to the parent exception class.
        """
        super().__init__(message)
34
+
35
+
36
class ScenarioError(BaseException):
    """
    Exception raised for errors related to Scenario operations.

    The message passed to the constructor is extended with a link to the
    Scenario documentation, and ``str(error)`` renders every URL in the
    message as a terminal hyperlink.

    Args:
        message: A description of the error that occurred.
    """

    # Appended to every error message by __init__.
    documentation = "https://docs.expectedparrot.com/en/latest/scenarios.html#module-edsl.scenarios.Scenario"

    def __init__(self, message: str):
        """
        Initialize the ScenarioError with a message and add documentation link.

        Args:
            message: A description of the error that occurred.
        """
        self.message = message + "\n" + "Documentation: " + self.documentation
        super().__init__(self.message)

    def __str__(self) -> str:
        """
        Return the stored message with its URLs wrapped as terminal hyperlinks.

        Returns:
            The error message with clickable URLs.
        """
        return self.make_urls_clickable(self.message)

    @staticmethod
    def make_urls_clickable(text: str) -> str:
        """
        Wrap every URL in ``text`` in an OSC 8 terminal hyperlink sequence.

        Each ``http://`` or ``https://`` URL (a run of non-whitespace
        characters) is replaced by ``ESC ] 8 ; ; URL BEL URL ESC ] 8 ; ; BEL``,
        so the URL serves as both link target and visible label.

        The substitution is done in a single pass over the text. Replacing
        URL by URL with ``str.replace`` would re-wrap text that is already
        inside an inserted escape sequence whenever a URL occurs more than
        once or is a prefix of another URL.

        Args:
            text: The text containing URLs to make clickable.

        Returns:
            The text with URLs converted to clickable links.

        Example:
            >>> ScenarioError.make_urls_clickable("no links here")
            'no links here'
            >>> linked = ScenarioError.make_urls_clickable("See https://example.com")
            >>> linked.count("https://example.com")
            2
        """

        def _wrap(match):
            # The matched URL is emitted twice: once as target, once as label.
            url = match.group(0)
            return f"\033]8;;{url}\007{url}\033]8;;\007"

        return re.sub(r"https?://[^\s]+", _wrap, text)
97
+
98
+
99
if __name__ == "__main__":
    # Executed only when this module is run as a script: run the module's
    # doctests with ELLIPSIS enabled so "..." in expected output matches
    # any substring.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)
@@ -2,8 +2,7 @@ from typing import Optional, Dict, Type
2
2
  from abc import ABC, abstractmethod
3
3
  import importlib.metadata
4
4
  import importlib.util
5
-
6
- from edsl.utilities.is_notebook import is_notebook
5
+ from ..utilities import is_notebook
7
6
 
8
7
 
9
8
  class FileMethods(ABC):
@@ -30,7 +29,7 @@ class FileMethods(ABC):
30
29
  def load_plugins(cls):
31
30
  """Load all file handler plugins including built-ins and external plugins."""
32
31
 
33
- from edsl.scenarios import handlers
32
+ from . import handlers # noqa: F401 - import needed for handler registration
34
33
 
35
34
  # Then load any external plugins
36
35
  try: