edsl 0.1.39__py3-none-any.whl → 0.1.39.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. edsl/Base.py +116 -197
  2. edsl/__init__.py +7 -15
  3. edsl/__version__.py +1 -1
  4. edsl/agents/Agent.py +147 -351
  5. edsl/agents/AgentList.py +73 -211
  6. edsl/agents/Invigilator.py +50 -101
  7. edsl/agents/InvigilatorBase.py +70 -62
  8. edsl/agents/PromptConstructor.py +225 -143
  9. edsl/agents/__init__.py +1 -0
  10. edsl/agents/prompt_helpers.py +3 -3
  11. edsl/auto/AutoStudy.py +5 -18
  12. edsl/auto/StageBase.py +40 -53
  13. edsl/auto/StageQuestions.py +1 -2
  14. edsl/auto/utilities.py +6 -0
  15. edsl/config.py +2 -22
  16. edsl/conversation/car_buying.py +1 -2
  17. edsl/coop/PriceFetcher.py +1 -1
  18. edsl/coop/coop.py +47 -125
  19. edsl/coop/utils.py +14 -14
  20. edsl/data/Cache.py +27 -45
  21. edsl/data/CacheEntry.py +15 -12
  22. edsl/data/CacheHandler.py +12 -31
  23. edsl/data/RemoteCacheSync.py +46 -154
  24. edsl/data/__init__.py +3 -4
  25. edsl/data_transfer_models.py +1 -2
  26. edsl/enums.py +0 -27
  27. edsl/exceptions/__init__.py +50 -50
  28. edsl/exceptions/agents.py +0 -12
  29. edsl/exceptions/questions.py +6 -24
  30. edsl/exceptions/scenarios.py +0 -7
  31. edsl/inference_services/AnthropicService.py +19 -38
  32. edsl/inference_services/AwsBedrock.py +2 -0
  33. edsl/inference_services/AzureAI.py +2 -0
  34. edsl/inference_services/GoogleService.py +12 -7
  35. edsl/inference_services/InferenceServiceABC.py +85 -18
  36. edsl/inference_services/InferenceServicesCollection.py +79 -120
  37. edsl/inference_services/MistralAIService.py +3 -0
  38. edsl/inference_services/OpenAIService.py +35 -47
  39. edsl/inference_services/PerplexityService.py +3 -0
  40. edsl/inference_services/TestService.py +10 -11
  41. edsl/inference_services/TogetherAIService.py +3 -5
  42. edsl/jobs/Answers.py +14 -1
  43. edsl/jobs/Jobs.py +431 -356
  44. edsl/jobs/JobsChecks.py +10 -35
  45. edsl/jobs/JobsPrompts.py +4 -6
  46. edsl/jobs/JobsRemoteInferenceHandler.py +133 -205
  47. edsl/jobs/buckets/BucketCollection.py +3 -44
  48. edsl/jobs/buckets/TokenBucket.py +21 -53
  49. edsl/jobs/interviews/Interview.py +408 -143
  50. edsl/jobs/runners/JobsRunnerAsyncio.py +403 -88
  51. edsl/jobs/runners/JobsRunnerStatus.py +165 -133
  52. edsl/jobs/tasks/QuestionTaskCreator.py +19 -21
  53. edsl/jobs/tasks/TaskHistory.py +18 -38
  54. edsl/jobs/tasks/task_status_enum.py +2 -0
  55. edsl/language_models/KeyLookup.py +30 -0
  56. edsl/language_models/LanguageModel.py +236 -194
  57. edsl/language_models/ModelList.py +19 -28
  58. edsl/language_models/__init__.py +2 -1
  59. edsl/language_models/registry.py +190 -0
  60. edsl/language_models/repair.py +2 -2
  61. edsl/language_models/unused/ReplicateBase.py +83 -0
  62. edsl/language_models/utilities.py +4 -5
  63. edsl/notebooks/Notebook.py +14 -19
  64. edsl/prompts/Prompt.py +39 -29
  65. edsl/questions/{answer_validator_mixin.py → AnswerValidatorMixin.py} +2 -47
  66. edsl/questions/QuestionBase.py +214 -68
  67. edsl/questions/{question_base_gen_mixin.py → QuestionBaseGenMixin.py} +50 -57
  68. edsl/questions/QuestionBasePromptsMixin.py +3 -7
  69. edsl/questions/QuestionBudget.py +1 -1
  70. edsl/questions/QuestionCheckBox.py +3 -3
  71. edsl/questions/QuestionExtract.py +7 -5
  72. edsl/questions/QuestionFreeText.py +3 -2
  73. edsl/questions/QuestionList.py +18 -10
  74. edsl/questions/QuestionMultipleChoice.py +23 -67
  75. edsl/questions/QuestionNumerical.py +4 -2
  76. edsl/questions/QuestionRank.py +17 -7
  77. edsl/questions/{response_validator_abc.py → ResponseValidatorABC.py} +26 -40
  78. edsl/questions/SimpleAskMixin.py +3 -4
  79. edsl/questions/__init__.py +1 -2
  80. edsl/questions/derived/QuestionLinearScale.py +3 -6
  81. edsl/questions/derived/QuestionTopK.py +1 -1
  82. edsl/questions/descriptors.py +3 -17
  83. edsl/questions/question_registry.py +1 -1
  84. edsl/results/CSSParameterizer.py +1 -1
  85. edsl/results/Dataset.py +7 -170
  86. edsl/results/DatasetExportMixin.py +305 -168
  87. edsl/results/DatasetTree.py +8 -28
  88. edsl/results/Result.py +206 -298
  89. edsl/results/Results.py +131 -149
  90. edsl/results/ResultsDBMixin.py +238 -0
  91. edsl/results/ResultsExportMixin.py +0 -2
  92. edsl/results/{results_selector.py → Selector.py} +13 -23
  93. edsl/results/TableDisplay.py +171 -98
  94. edsl/results/__init__.py +1 -1
  95. edsl/scenarios/FileStore.py +239 -150
  96. edsl/scenarios/Scenario.py +193 -90
  97. edsl/scenarios/ScenarioHtmlMixin.py +3 -4
  98. edsl/scenarios/{scenario_join.py → ScenarioJoin.py} +6 -10
  99. edsl/scenarios/ScenarioList.py +244 -415
  100. edsl/scenarios/ScenarioListExportMixin.py +7 -0
  101. edsl/scenarios/ScenarioListPdfMixin.py +37 -15
  102. edsl/scenarios/__init__.py +2 -1
  103. edsl/study/ObjectEntry.py +1 -1
  104. edsl/study/SnapShot.py +1 -1
  105. edsl/study/Study.py +12 -5
  106. edsl/surveys/Rule.py +4 -5
  107. edsl/surveys/RuleCollection.py +27 -25
  108. edsl/surveys/Survey.py +791 -270
  109. edsl/surveys/SurveyCSS.py +8 -20
  110. edsl/surveys/{SurveyFlowVisualization.py → SurveyFlowVisualizationMixin.py} +9 -11
  111. edsl/surveys/__init__.py +2 -4
  112. edsl/surveys/descriptors.py +2 -6
  113. edsl/surveys/instructions/ChangeInstruction.py +2 -1
  114. edsl/surveys/instructions/Instruction.py +13 -4
  115. edsl/surveys/instructions/InstructionCollection.py +6 -11
  116. edsl/templates/error_reporting/interview_details.html +1 -1
  117. edsl/templates/error_reporting/report.html +1 -1
  118. edsl/tools/plotting.py +1 -1
  119. edsl/utilities/utilities.py +23 -35
  120. {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/METADATA +10 -12
  121. edsl-0.1.39.dev1.dist-info/RECORD +277 -0
  122. {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/WHEEL +1 -1
  123. edsl/agents/QuestionInstructionPromptBuilder.py +0 -128
  124. edsl/agents/QuestionTemplateReplacementsBuilder.py +0 -137
  125. edsl/agents/question_option_processor.py +0 -172
  126. edsl/coop/CoopFunctionsMixin.py +0 -15
  127. edsl/coop/ExpectedParrotKeyHandler.py +0 -125
  128. edsl/exceptions/inference_services.py +0 -5
  129. edsl/inference_services/AvailableModelCacheHandler.py +0 -184
  130. edsl/inference_services/AvailableModelFetcher.py +0 -215
  131. edsl/inference_services/ServiceAvailability.py +0 -135
  132. edsl/inference_services/data_structures.py +0 -134
  133. edsl/jobs/AnswerQuestionFunctionConstructor.py +0 -223
  134. edsl/jobs/FetchInvigilator.py +0 -47
  135. edsl/jobs/InterviewTaskManager.py +0 -98
  136. edsl/jobs/InterviewsConstructor.py +0 -50
  137. edsl/jobs/JobsComponentConstructor.py +0 -189
  138. edsl/jobs/JobsRemoteInferenceLogger.py +0 -239
  139. edsl/jobs/RequestTokenEstimator.py +0 -30
  140. edsl/jobs/async_interview_runner.py +0 -138
  141. edsl/jobs/buckets/TokenBucketAPI.py +0 -211
  142. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  143. edsl/jobs/check_survey_scenario_compatibility.py +0 -85
  144. edsl/jobs/data_structures.py +0 -120
  145. edsl/jobs/decorators.py +0 -35
  146. edsl/jobs/jobs_status_enums.py +0 -9
  147. edsl/jobs/loggers/HTMLTableJobLogger.py +0 -304
  148. edsl/jobs/results_exceptions_handler.py +0 -98
  149. edsl/language_models/ComputeCost.py +0 -63
  150. edsl/language_models/PriceManager.py +0 -127
  151. edsl/language_models/RawResponseHandler.py +0 -106
  152. edsl/language_models/ServiceDataSources.py +0 -0
  153. edsl/language_models/key_management/KeyLookup.py +0 -63
  154. edsl/language_models/key_management/KeyLookupBuilder.py +0 -273
  155. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  156. edsl/language_models/key_management/__init__.py +0 -0
  157. edsl/language_models/key_management/models.py +0 -131
  158. edsl/language_models/model.py +0 -256
  159. edsl/notebooks/NotebookToLaTeX.py +0 -142
  160. edsl/questions/ExceptionExplainer.py +0 -77
  161. edsl/questions/HTMLQuestion.py +0 -103
  162. edsl/questions/QuestionMatrix.py +0 -265
  163. edsl/questions/data_structures.py +0 -20
  164. edsl/questions/loop_processor.py +0 -149
  165. edsl/questions/response_validator_factory.py +0 -34
  166. edsl/questions/templates/matrix/__init__.py +0 -1
  167. edsl/questions/templates/matrix/answering_instructions.jinja +0 -5
  168. edsl/questions/templates/matrix/question_presentation.jinja +0 -20
  169. edsl/results/MarkdownToDocx.py +0 -122
  170. edsl/results/MarkdownToPDF.py +0 -111
  171. edsl/results/TextEditor.py +0 -50
  172. edsl/results/file_exports.py +0 -252
  173. edsl/results/smart_objects.py +0 -96
  174. edsl/results/table_data_class.py +0 -12
  175. edsl/results/table_renderers.py +0 -118
  176. edsl/scenarios/ConstructDownloadLink.py +0 -109
  177. edsl/scenarios/DocumentChunker.py +0 -102
  178. edsl/scenarios/DocxScenario.py +0 -16
  179. edsl/scenarios/PdfExtractor.py +0 -40
  180. edsl/scenarios/directory_scanner.py +0 -96
  181. edsl/scenarios/file_methods.py +0 -85
  182. edsl/scenarios/handlers/__init__.py +0 -13
  183. edsl/scenarios/handlers/csv.py +0 -49
  184. edsl/scenarios/handlers/docx.py +0 -76
  185. edsl/scenarios/handlers/html.py +0 -37
  186. edsl/scenarios/handlers/json.py +0 -111
  187. edsl/scenarios/handlers/latex.py +0 -5
  188. edsl/scenarios/handlers/md.py +0 -51
  189. edsl/scenarios/handlers/pdf.py +0 -68
  190. edsl/scenarios/handlers/png.py +0 -39
  191. edsl/scenarios/handlers/pptx.py +0 -105
  192. edsl/scenarios/handlers/py.py +0 -294
  193. edsl/scenarios/handlers/sql.py +0 -313
  194. edsl/scenarios/handlers/sqlite.py +0 -149
  195. edsl/scenarios/handlers/txt.py +0 -33
  196. edsl/scenarios/scenario_selector.py +0 -156
  197. edsl/surveys/ConstructDAG.py +0 -92
  198. edsl/surveys/EditSurvey.py +0 -221
  199. edsl/surveys/InstructionHandler.py +0 -100
  200. edsl/surveys/MemoryManagement.py +0 -72
  201. edsl/surveys/RuleManager.py +0 -172
  202. edsl/surveys/Simulator.py +0 -75
  203. edsl/surveys/SurveyToApp.py +0 -141
  204. edsl/utilities/PrettyList.py +0 -56
  205. edsl/utilities/is_notebook.py +0 -18
  206. edsl/utilities/is_valid_variable_name.py +0 -11
  207. edsl/utilities/remove_edsl_version.py +0 -24
  208. edsl-0.1.39.dist-info/RECORD +0 -358
  209. /edsl/questions/{register_questions_meta.py → RegisterQuestionsMeta.py} +0 -0
  210. /edsl/results/{results_fetch_mixin.py → ResultsFetchMixin.py} +0 -0
  211. /edsl/results/{results_tools_mixin.py → ResultsToolsMixin.py} +0 -0
  212. {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/LICENSE +0 -0
@@ -1,118 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from edsl.results.table_data_class import TableData
3
-
4
-
5
- class DataTablesRendererABC(ABC):
6
- def __init__(self, table_data: TableData):
7
- self.table_data = table_data
8
-
9
- @abstractmethod
10
- def render_html(self) -> str:
11
- pass
12
-
13
-
14
- class DataTablesRenderer(DataTablesRendererABC):
15
- """Interactive DataTables renderer implementation"""
16
-
17
- def render_html(self) -> str:
18
- html_template = """
19
- <!DOCTYPE html>
20
- <html>
21
- <head>
22
- <link href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/css/bootstrap.min.css" rel="stylesheet">
23
- <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-bs5/1.13.6/dataTables.bootstrap5.min.css" rel="stylesheet">
24
- <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-buttons-bs5/2.4.1/buttons.bootstrap5.min.css" rel="stylesheet">
25
- <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-responsive-bs5/2.4.1/responsive.bootstrap5.min.css" rel="stylesheet">
26
- <style>
27
- {css}
28
- </style>
29
- </head>
30
- <body>
31
- <div class="container">
32
- <table id="interactive-table" class="table table-striped" style="width:100%">
33
- <thead>
34
- <tr>{header_cells}</tr>
35
- </thead>
36
- <tbody>{body_rows}</tbody>
37
- </table>
38
- </div>
39
- <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.7.0/jquery.min.js"></script>
40
- <script src="https://cdnjs.cloudflare.com/ajax/libs/datatables.net/1.13.6/jquery.dataTables.min.js"></script>
41
- <script src="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-bs5/1.13.6/dataTables.bootstrap5.min.js"></script>
42
- <script>
43
- $(document).ready(function() {{
44
- $('#interactive-table').DataTable({{
45
- pageLength: 10,
46
- lengthMenu: [[5, 10, 25, -1], [5, 10, 25, "All"]],
47
- scrollX: true,
48
- responsive: true,
49
- dom: 'Bfrtip',
50
- buttons: [
51
- {{
52
- extend: 'colvis',
53
- text: 'Show/Hide Columns'
54
- }}
55
- ]
56
- }});
57
- }});
58
- </script>
59
- </body>
60
- </html>
61
- """
62
-
63
- header_cells = "".join(
64
- f"<th>{header}</th>" for header in self.table_data.headers
65
- )
66
- body_rows = ""
67
- for row in self.table_data.data:
68
- body_rows += "<tr>"
69
- body_rows += "".join(f"<td>{cell}</td>" for cell in row)
70
- body_rows += "</tr>"
71
-
72
- parameters = self.table_data.parameters or {}
73
- css = self.get_css()
74
- if hasattr(self, "css_parameterizer"):
75
- css = self.css_parameterizer(css).apply_parameters(parameters)
76
-
77
- return html_template.format(
78
- css=css, header_cells=header_cells, body_rows=body_rows
79
- )
80
-
81
- @classmethod
82
- def get_css(cls) -> str:
83
- """Load CSS content from the file next to this module"""
84
- css_path = Path(__file__).parent / "table_display.css"
85
- return css_path.read_text()
86
-
87
-
88
- class PandasStyleRenderer(DataTablesRendererABC):
89
- """Pandas-based styled renderer implementation"""
90
-
91
- def render_html(self) -> str:
92
- import pandas as pd
93
-
94
- from contextlib import redirect_stderr
95
- import io
96
-
97
- stderr = io.StringIO()
98
- with redirect_stderr(stderr):
99
- if self.table_data.raw_data_set is not None and hasattr(
100
- self.table_data.raw_data_set, "to_pandas"
101
- ):
102
- df = self.table_data.raw_data_set.to_pandas()
103
- else:
104
- df = pd.DataFrame(self.table_data.data, columns=self.table_data.headers)
105
-
106
- styled_df = df.style.set_properties(
107
- **{"text-align": "left"}
108
- ).background_gradient()
109
-
110
- return f"""
111
- <div style="max-height: 500px; overflow-y: auto;">
112
- {styled_df.to_html()}
113
- </div>
114
- """
115
-
116
- @classmethod
117
- def get_css(cls) -> str:
118
- return "" # Pandas styling handles its own CSS
@@ -1,109 +0,0 @@
1
- import os
2
- import mimetypes
3
-
4
-
5
- class ConstructDownloadLink:
6
- """
7
- A class to create HTML download links for FileStore objects.
8
- The links can be displayed in Jupyter notebooks or other web interfaces.
9
- """
10
-
11
- def __init__(self, filestore):
12
- """
13
- Initialize with a FileStore object.
14
-
15
- Args:
16
- filestore: A FileStore object containing the file to be made downloadable
17
- """
18
- self.filestore = filestore
19
-
20
- def create_link(self, custom_filename=None, style=None):
21
- from IPython.display import HTML
22
-
23
- html = self.html_create_link(custom_filename, style)
24
- return HTML(html)
25
-
26
- def html_create_link(self, custom_filename=None, style=None):
27
- """
28
- Create an HTML download link for the file.
29
-
30
- Args:
31
- custom_filename (str, optional): Custom name for the downloaded file.
32
- If None, uses original filename.
33
- style (dict, optional): Custom CSS styles for the download button.
34
- If None, uses default styling.
35
-
36
- Returns:
37
- IPython.display.HTML: HTML object containing the download link
38
- """
39
-
40
- # Get filename from path or use custom filename
41
- original_filename = os.path.basename(self.filestore.path)
42
- filename = custom_filename or original_filename
43
-
44
- # Use the base64 string already stored in FileStore
45
- b64_data = self.filestore.base64_string
46
-
47
- # Use mime type from FileStore or guess it
48
- mime_type = self.filestore.mime_type
49
-
50
- # Default style if none provided
51
- default_style = {
52
- "background-color": "#4CAF50",
53
- "color": "white",
54
- "padding": "10px 20px",
55
- "text-decoration": "none",
56
- "border-radius": "4px",
57
- "display": "inline-block",
58
- "margin": "10px 0",
59
- "font-family": "sans-serif",
60
- "cursor": "pointer",
61
- }
62
-
63
- button_style = style or default_style
64
- style_str = "; ".join(f"{k}: {v}" for k, v in button_style.items())
65
-
66
- html = f"""
67
- <a download="{filename}"
68
- href="data:{mime_type};base64,{b64_data}"
69
- style="{style_str}">
70
- Download {filename}
71
- </a>
72
- """
73
- return html
74
-
75
- def create_multiple_links(self, files, custom_filenames=None, style=None):
76
- """
77
- Create multiple download links at once.
78
- Useful when you want to provide different versions of the same file
79
- or related files together.
80
-
81
- Args:
82
- files (list): List of FileStore objects
83
- custom_filenames (list, optional): List of custom filenames for downloads
84
- style (dict, optional): Custom CSS styles for the download buttons
85
-
86
- Returns:
87
- IPython.display.HTML: HTML object containing all download links
88
- """
89
- if custom_filenames is None:
90
- custom_filenames = [None] * len(files)
91
-
92
- html_parts = []
93
- for file_obj, custom_name in zip(files, custom_filenames):
94
- link_creator = ConstructDownloadLink(file_obj)
95
- html_parts.append(
96
- link_creator.create_link(
97
- custom_filename=custom_name, style=style
98
- )._repr_html_()
99
- )
100
-
101
- return HTML(
102
- '<div style="display: flex; gap: 10px;">' + "".join(html_parts) + "</div>"
103
- )
104
-
105
-
106
- if __name__ == "__main__":
107
- import doctest
108
-
109
- doctest.testmod()
@@ -1,102 +0,0 @@
1
- from __future__ import annotations
2
- from typing import Optional, Generator, TYPE_CHECKING
3
- import copy
4
-
5
- if TYPE_CHECKING:
6
- from edsl.scenarios.Scenario import Scenario
7
- from edsl.scenarios.ScenarioList import ScenarioList
8
-
9
-
10
- class DocumentChunker:
11
- def __init__(self, scenario: "Scenario"):
12
- self.scenario = scenario
13
-
14
- @staticmethod
15
- def _line_chunks(text, num_lines: int) -> Generator[str, None, None]:
16
- """Split a text into chunks of a given size.
17
-
18
- :param text: The text to split.
19
- :param num_lines: The number of lines in each chunk.
20
-
21
- Example:
22
-
23
- >>> list(DocumentChunker._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
24
- ['This is a test.', 'This is a test. This is a test.']
25
- """
26
- lines = text.split("\n")
27
- for i in range(0, len(lines), num_lines):
28
- chunk = "\n".join(lines[i : i + num_lines])
29
- yield chunk
30
-
31
- @staticmethod
32
- def _word_chunks(text, num_words: int) -> Generator[str, None, None]:
33
- """Split a text into chunks of a given size.
34
-
35
- :param text: The text to split.
36
- :param num_words: The number of words in each chunk.
37
-
38
- Example:
39
-
40
- >>> list(DocumentChunker._word_chunks("This is a test.", 2))
41
- ['This is', 'a test.']
42
- """
43
- words = text.split()
44
- for i in range(0, len(words), num_words):
45
- chunk = " ".join(words[i : i + num_words])
46
- yield chunk
47
-
48
- def chunk(
49
- self,
50
- field,
51
- num_words: Optional[int] = None,
52
- num_lines: Optional[int] = None,
53
- include_original=False,
54
- hash_original=False,
55
- ) -> ScenarioList:
56
- """Split a field into chunks of a given size.
57
-
58
- :param field: The field to split.
59
- :param num_words: The number of words in each chunk.
60
- :param num_lines: The number of lines in each chunk.
61
- :param include_original: Whether to include the original field in the new scenarios.
62
- :param hash_original: Whether to hash the original field in the new scenarios.
63
-
64
- If you specify `include_original=True`, the original field will be included in the new scenarios with an "_original" suffix.
65
- """
66
- from edsl.scenarios.ScenarioList import ScenarioList
67
- import hashlib
68
-
69
- if num_words is not None:
70
- chunks = list(self._word_chunks(self.scenario[field], num_words))
71
-
72
- if num_lines is not None:
73
- chunks = list(self._line_chunks(self.scenario[field], num_lines))
74
-
75
- if num_words is None and num_lines is None:
76
- raise ValueError("You must specify either num_words or num_lines.")
77
-
78
- if num_words is not None and num_lines is not None:
79
- raise ValueError(
80
- "You must specify either num_words or num_lines, but not both."
81
- )
82
-
83
- scenarios = []
84
- for i, chunk in enumerate(chunks):
85
- new_scenario = copy.deepcopy(self.scenario)
86
- new_scenario[field] = chunk
87
- new_scenario[field + "_chunk"] = i
88
- if include_original:
89
- if hash_original:
90
- new_scenario[field + "_original"] = hashlib.md5(
91
- self.scenario[field].encode()
92
- ).hexdigest()
93
- else:
94
- new_scenario[field + "_original"] = self.scenario[field]
95
- scenarios.append(new_scenario)
96
- return ScenarioList(scenarios)
97
-
98
-
99
- if __name__ == "__main__":
100
- import doctest
101
-
102
- doctest.testmod()
@@ -1,16 +0,0 @@
1
- class DocxScenario:
2
- def __init__(self, docx_path: str):
3
- from docx import Document
4
-
5
- self.doc = Document(docx_path)
6
- self.docx_path = docx_path
7
-
8
- def get_scenario_dict(self) -> dict:
9
- # Extract all text
10
- full_text = []
11
- for para in self.doc.paragraphs:
12
- full_text.append(para.text)
13
-
14
- # Join the text from all paragraphs
15
- text = "\n".join(full_text)
16
- return {"file_path": self.docx_path, "text": text}
@@ -1,40 +0,0 @@
1
- import os
2
-
3
-
4
- class PdfExtractor:
5
- def __init__(self, pdf_path: str, parent_object: object):
6
- self.pdf_path = pdf_path
7
- self.constructor = parent_object.__class__
8
-
9
- def get_object(self) -> object:
10
- return self.constructor(self._get_pdf_dict())
11
-
12
- def _get_pdf_dict(self) -> dict:
13
- # Ensure the file exists
14
- import fitz
15
-
16
- if not os.path.exists(self.pdf_path):
17
- raise FileNotFoundError(f"The file {self.pdf_path} does not exist.")
18
-
19
- # Open the PDF file
20
- document = fitz.open(self.pdf_path)
21
-
22
- # Get the filename from the path
23
- filename = os.path.basename(self.pdf_path)
24
-
25
- # Iterate through each page and extract text
26
- text = ""
27
- for page_num in range(len(document)):
28
- page = document.load_page(page_num)
29
- blocks = page.get_text("blocks") # Extract text blocks
30
-
31
- # Sort blocks by their vertical position (y0) to maintain reading order
32
- blocks.sort(key=lambda b: (b[1], b[0])) # Sort by y0 first, then x0
33
-
34
- # Combine the text blocks in order
35
- for block in blocks:
36
- text += block[4] + "\n"
37
-
38
- # Create a dictionary for the combined text
39
- page_info = {"filename": filename, "text": text}
40
- return page_info
@@ -1,96 +0,0 @@
1
- # directory_scanner.py
2
- from dataclasses import dataclass
3
- from typing import Optional, List, Iterator, TypeVar, Generic, Callable, Any
4
- import os
5
-
6
- T = TypeVar("T")
7
-
8
-
9
- @dataclass
10
- class DirectoryScanner:
11
- """
12
- Scanner for finding files in a directory based on various criteria.
13
- """
14
-
15
- directory_path: str
16
-
17
- def scan(
18
- self,
19
- factory: Callable[[str], T],
20
- recursive: bool = False,
21
- suffix_allow_list: Optional[List[str]] = None,
22
- suffix_exclude_list: Optional[List[str]] = None,
23
- example_suffix: Optional[str] = None,
24
- include_no_extension: bool = True,
25
- ) -> List[T]:
26
- """
27
- Eagerly scan directory and return list of objects created by factory.
28
-
29
- Args:
30
- factory: Callable that creates objects from file paths
31
- recursive: If True, recursively traverse subdirectories
32
- suffix_allow_list: List of allowed file extensions (without dots)
33
- suffix_exclude_list: List of excluded file extensions (takes precedence over allow list)
34
- example_suffix: If provided, only include files with this example suffix
35
- include_no_extension: Whether to include files without extensions
36
- """
37
- return list(
38
- self.iter_scan(
39
- factory,
40
- recursive=recursive,
41
- suffix_allow_list=suffix_allow_list,
42
- suffix_exclude_list=suffix_exclude_list,
43
- example_suffix=example_suffix,
44
- include_no_extension=include_no_extension,
45
- )
46
- )
47
-
48
- def iter_scan(
49
- self,
50
- factory: Callable[[str], T],
51
- recursive: bool = False,
52
- suffix_allow_list: Optional[List[str]] = None,
53
- suffix_exclude_list: Optional[List[str]] = None,
54
- example_suffix: Optional[str] = None,
55
- include_no_extension: bool = True,
56
- ) -> Iterator[T]:
57
- """
58
- Lazily scan directory and yield objects created by factory.
59
- """
60
-
61
- def should_include_file(filepath: str) -> bool:
62
- _, ext = os.path.splitext(filepath)
63
- ext = ext[1:] if ext else ""
64
-
65
- # Handle no extension case
66
- if not ext:
67
- return include_no_extension
68
-
69
- # Check exclusions first (they take precedence)
70
- if suffix_exclude_list and ext in suffix_exclude_list:
71
- return False
72
-
73
- # Check example suffix if specified
74
- if example_suffix and not filepath.endswith(example_suffix):
75
- return False
76
-
77
- # Check allowed suffixes if specified
78
- if suffix_allow_list and ext not in suffix_allow_list:
79
- return False
80
-
81
- return True
82
-
83
- def iter_files():
84
- if recursive:
85
- for root, _, files in os.walk(self.directory_path):
86
- for file in files:
87
- yield os.path.join(root, file)
88
- else:
89
- for file in os.listdir(self.directory_path):
90
- file_path = os.path.join(self.directory_path, file)
91
- if os.path.isfile(file_path):
92
- yield file_path
93
-
94
- for file_path in iter_files():
95
- if should_include_file(file_path):
96
- yield factory(file_path)
@@ -1,85 +0,0 @@
1
- from typing import Optional, Dict, Type
2
- from abc import ABC, abstractmethod
3
- import importlib.metadata
4
- import importlib.util
5
-
6
- from edsl.utilities.is_notebook import is_notebook
7
-
8
-
9
- class FileMethods(ABC):
10
- _handlers: Dict[str, Type["FileMethods"]] = {}
11
-
12
- def __init__(self, path: Optional[str] = None):
13
- self.path = path
14
-
15
- def __init_subclass__(cls) -> None:
16
- """Register subclasses automatically when they're defined."""
17
- super().__init_subclass__()
18
- if hasattr(cls, "suffix"):
19
- FileMethods._handlers[cls.suffix] = cls
20
-
21
- @classmethod
22
- def get_handler(cls, suffix: str) -> Optional[Type["FileMethods"]]:
23
- """Get the appropriate handler class for a given suffix."""
24
- # Load plugins if they haven't been loaded yet
25
- if not cls._handlers:
26
- cls.load_plugins()
27
- return cls._handlers.get(suffix.lower())
28
-
29
- @classmethod
30
- def load_plugins(cls):
31
- """Load all file handler plugins including built-ins and external plugins."""
32
-
33
- from edsl.scenarios import handlers
34
-
35
- # Then load any external plugins
36
- try:
37
- entries = importlib.metadata.entry_points(group="file_handlers")
38
- except TypeError: # some Python 3.9 bullshit
39
- # entries = importlib.metadata.entry_points()
40
- entries = []
41
-
42
- for ep in entries:
43
- try:
44
- handler_class = ep.load()
45
- # Registration happens automatically via __init_subclass__
46
- except Exception as e:
47
- print(f"Failed to load external handler {ep.name}: {e}")
48
-
49
- @classmethod
50
- def get_handler_for_path(cls, path: str) -> Optional[Type["FileMethods"]]:
51
- """Get the appropriate handler class for a file path."""
52
- suffix = path.split(".")[-1].lower() if "." in path else ""
53
- return cls.get_handler(suffix)
54
-
55
- @classmethod
56
- def create(cls, path: str) -> Optional["FileMethods"]:
57
- """Create an appropriate handler instance for the given path."""
58
- handler_class = cls.get_handler_for_path(path)
59
- if handler_class:
60
- return handler_class(path)
61
- return None
62
-
63
- @classmethod
64
- def supported_file_types(cls):
65
- if not cls._handlers:
66
- cls.load_plugins()
67
- return list(cls._handlers.keys())
68
-
69
- @abstractmethod
70
- def view_system(self):
71
- ...
72
-
73
- @abstractmethod
74
- def view_notebook(self):
75
- ...
76
-
77
- def view(self):
78
- if is_notebook():
79
- self.view_notebook()
80
- else:
81
- self.view_system()
82
-
83
- @abstractmethod
84
- def example(self):
85
- ...
@@ -1,13 +0,0 @@
1
- from .pdf import PdfMethods
2
- from .docx import DocxMethods
3
- from .png import PngMethods
4
- from .txt import TxtMethods
5
- from .html import HtmlMethods
6
- from .md import MarkdownMethods
7
- from .csv import CsvMethods
8
- from .json import JsonMethods
9
- from .sql import SqlMethods
10
- from .pptx import PptxMethods
11
- from .latex import LaTeXMethods
12
- from .py import PyMethods
13
- from .sqlite import SQLiteMethods
@@ -1,49 +0,0 @@
1
- import tempfile
2
- from edsl.scenarios.file_methods import FileMethods
3
-
4
-
5
- class CsvMethods(FileMethods):
6
- suffix = "csv"
7
-
8
- def view_system(self):
9
- import os
10
- import subprocess
11
-
12
- if os.path.exists(self.path):
13
- try:
14
- if (os_name := os.name) == "posix":
15
- subprocess.run(["open", self.path], check=True) # macOS
16
- elif os_name == "nt":
17
- os.startfile(self.path) # Windows
18
- else:
19
- subprocess.run(["xdg-open", self.path], check=True) # Linux
20
- except Exception as e:
21
- print(f"Error opening CSV: {e}")
22
- else:
23
- print("CSV file was not found.")
24
-
25
- def view_notebook(self):
26
- import pandas as pd
27
- from IPython.display import display
28
-
29
- df = pd.read_csv(self.path)
30
- display(df)
31
-
32
- def example(self):
33
- import pandas as pd
34
-
35
- df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
36
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
37
- df.to_csv(f.name, index=False)
38
- return f.name
39
-
40
- def to_pandas(self):
41
- """
42
- Convert the CSV file to a pandas DataFrame.
43
-
44
- Returns:
45
- pandas.DataFrame: The data from the CSV as a DataFrame
46
- """
47
- import pandas as pd
48
-
49
- return pd.read_csv(self.path)