edsl 0.1.38.dev4__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. edsl/Base.py +197 -116
  2. edsl/__init__.py +15 -7
  3. edsl/__version__.py +1 -1
  4. edsl/agents/Agent.py +351 -147
  5. edsl/agents/AgentList.py +211 -73
  6. edsl/agents/Invigilator.py +101 -50
  7. edsl/agents/InvigilatorBase.py +62 -70
  8. edsl/agents/PromptConstructor.py +143 -225
  9. edsl/agents/QuestionInstructionPromptBuilder.py +128 -0
  10. edsl/agents/QuestionTemplateReplacementsBuilder.py +137 -0
  11. edsl/agents/__init__.py +0 -1
  12. edsl/agents/prompt_helpers.py +3 -3
  13. edsl/agents/question_option_processor.py +172 -0
  14. edsl/auto/AutoStudy.py +18 -5
  15. edsl/auto/StageBase.py +53 -40
  16. edsl/auto/StageQuestions.py +2 -1
  17. edsl/auto/utilities.py +0 -6
  18. edsl/config.py +22 -2
  19. edsl/conversation/car_buying.py +2 -1
  20. edsl/coop/CoopFunctionsMixin.py +15 -0
  21. edsl/coop/ExpectedParrotKeyHandler.py +125 -0
  22. edsl/coop/PriceFetcher.py +1 -1
  23. edsl/coop/coop.py +125 -47
  24. edsl/coop/utils.py +14 -14
  25. edsl/data/Cache.py +45 -27
  26. edsl/data/CacheEntry.py +12 -15
  27. edsl/data/CacheHandler.py +31 -12
  28. edsl/data/RemoteCacheSync.py +154 -46
  29. edsl/data/__init__.py +4 -3
  30. edsl/data_transfer_models.py +2 -1
  31. edsl/enums.py +27 -0
  32. edsl/exceptions/__init__.py +50 -50
  33. edsl/exceptions/agents.py +12 -0
  34. edsl/exceptions/inference_services.py +5 -0
  35. edsl/exceptions/questions.py +24 -6
  36. edsl/exceptions/scenarios.py +7 -0
  37. edsl/inference_services/AnthropicService.py +38 -19
  38. edsl/inference_services/AvailableModelCacheHandler.py +184 -0
  39. edsl/inference_services/AvailableModelFetcher.py +215 -0
  40. edsl/inference_services/AwsBedrock.py +0 -2
  41. edsl/inference_services/AzureAI.py +0 -2
  42. edsl/inference_services/GoogleService.py +7 -12
  43. edsl/inference_services/InferenceServiceABC.py +18 -85
  44. edsl/inference_services/InferenceServicesCollection.py +120 -79
  45. edsl/inference_services/MistralAIService.py +0 -3
  46. edsl/inference_services/OpenAIService.py +47 -35
  47. edsl/inference_services/PerplexityService.py +0 -3
  48. edsl/inference_services/ServiceAvailability.py +135 -0
  49. edsl/inference_services/TestService.py +11 -10
  50. edsl/inference_services/TogetherAIService.py +5 -3
  51. edsl/inference_services/data_structures.py +134 -0
  52. edsl/jobs/AnswerQuestionFunctionConstructor.py +223 -0
  53. edsl/jobs/Answers.py +1 -14
  54. edsl/jobs/FetchInvigilator.py +47 -0
  55. edsl/jobs/InterviewTaskManager.py +98 -0
  56. edsl/jobs/InterviewsConstructor.py +50 -0
  57. edsl/jobs/Jobs.py +356 -431
  58. edsl/jobs/JobsChecks.py +35 -10
  59. edsl/jobs/JobsComponentConstructor.py +189 -0
  60. edsl/jobs/JobsPrompts.py +6 -4
  61. edsl/jobs/JobsRemoteInferenceHandler.py +205 -133
  62. edsl/jobs/JobsRemoteInferenceLogger.py +239 -0
  63. edsl/jobs/RequestTokenEstimator.py +30 -0
  64. edsl/jobs/async_interview_runner.py +138 -0
  65. edsl/jobs/buckets/BucketCollection.py +44 -3
  66. edsl/jobs/buckets/TokenBucket.py +53 -21
  67. edsl/jobs/buckets/TokenBucketAPI.py +211 -0
  68. edsl/jobs/buckets/TokenBucketClient.py +191 -0
  69. edsl/jobs/check_survey_scenario_compatibility.py +85 -0
  70. edsl/jobs/data_structures.py +120 -0
  71. edsl/jobs/decorators.py +35 -0
  72. edsl/jobs/interviews/Interview.py +143 -408
  73. edsl/jobs/jobs_status_enums.py +9 -0
  74. edsl/jobs/loggers/HTMLTableJobLogger.py +304 -0
  75. edsl/jobs/results_exceptions_handler.py +98 -0
  76. edsl/jobs/runners/JobsRunnerAsyncio.py +88 -403
  77. edsl/jobs/runners/JobsRunnerStatus.py +133 -165
  78. edsl/jobs/tasks/QuestionTaskCreator.py +21 -19
  79. edsl/jobs/tasks/TaskHistory.py +38 -18
  80. edsl/jobs/tasks/task_status_enum.py +0 -2
  81. edsl/language_models/ComputeCost.py +63 -0
  82. edsl/language_models/LanguageModel.py +194 -236
  83. edsl/language_models/ModelList.py +28 -19
  84. edsl/language_models/PriceManager.py +127 -0
  85. edsl/language_models/RawResponseHandler.py +106 -0
  86. edsl/language_models/ServiceDataSources.py +0 -0
  87. edsl/language_models/__init__.py +1 -2
  88. edsl/language_models/key_management/KeyLookup.py +63 -0
  89. edsl/language_models/key_management/KeyLookupBuilder.py +273 -0
  90. edsl/language_models/key_management/KeyLookupCollection.py +38 -0
  91. edsl/language_models/key_management/__init__.py +0 -0
  92. edsl/language_models/key_management/models.py +131 -0
  93. edsl/language_models/model.py +256 -0
  94. edsl/language_models/repair.py +2 -2
  95. edsl/language_models/utilities.py +5 -4
  96. edsl/notebooks/Notebook.py +19 -14
  97. edsl/notebooks/NotebookToLaTeX.py +142 -0
  98. edsl/prompts/Prompt.py +29 -39
  99. edsl/questions/ExceptionExplainer.py +77 -0
  100. edsl/questions/HTMLQuestion.py +103 -0
  101. edsl/questions/QuestionBase.py +68 -214
  102. edsl/questions/QuestionBasePromptsMixin.py +7 -3
  103. edsl/questions/QuestionBudget.py +1 -1
  104. edsl/questions/QuestionCheckBox.py +3 -3
  105. edsl/questions/QuestionExtract.py +5 -7
  106. edsl/questions/QuestionFreeText.py +2 -3
  107. edsl/questions/QuestionList.py +10 -18
  108. edsl/questions/QuestionMatrix.py +265 -0
  109. edsl/questions/QuestionMultipleChoice.py +67 -23
  110. edsl/questions/QuestionNumerical.py +2 -4
  111. edsl/questions/QuestionRank.py +7 -17
  112. edsl/questions/SimpleAskMixin.py +4 -3
  113. edsl/questions/__init__.py +2 -1
  114. edsl/questions/{AnswerValidatorMixin.py → answer_validator_mixin.py} +47 -2
  115. edsl/questions/data_structures.py +20 -0
  116. edsl/questions/derived/QuestionLinearScale.py +6 -3
  117. edsl/questions/derived/QuestionTopK.py +1 -1
  118. edsl/questions/descriptors.py +17 -3
  119. edsl/questions/loop_processor.py +149 -0
  120. edsl/questions/{QuestionBaseGenMixin.py → question_base_gen_mixin.py} +57 -50
  121. edsl/questions/question_registry.py +1 -1
  122. edsl/questions/{ResponseValidatorABC.py → response_validator_abc.py} +40 -26
  123. edsl/questions/response_validator_factory.py +34 -0
  124. edsl/questions/templates/matrix/__init__.py +1 -0
  125. edsl/questions/templates/matrix/answering_instructions.jinja +5 -0
  126. edsl/questions/templates/matrix/question_presentation.jinja +20 -0
  127. edsl/results/CSSParameterizer.py +1 -1
  128. edsl/results/Dataset.py +170 -7
  129. edsl/results/DatasetExportMixin.py +168 -305
  130. edsl/results/DatasetTree.py +28 -8
  131. edsl/results/MarkdownToDocx.py +122 -0
  132. edsl/results/MarkdownToPDF.py +111 -0
  133. edsl/results/Result.py +298 -206
  134. edsl/results/Results.py +149 -131
  135. edsl/results/ResultsExportMixin.py +2 -0
  136. edsl/results/TableDisplay.py +98 -171
  137. edsl/results/TextEditor.py +50 -0
  138. edsl/results/__init__.py +1 -1
  139. edsl/results/file_exports.py +252 -0
  140. edsl/results/{Selector.py → results_selector.py} +23 -13
  141. edsl/results/smart_objects.py +96 -0
  142. edsl/results/table_data_class.py +12 -0
  143. edsl/results/table_renderers.py +118 -0
  144. edsl/scenarios/ConstructDownloadLink.py +109 -0
  145. edsl/scenarios/DocumentChunker.py +102 -0
  146. edsl/scenarios/DocxScenario.py +16 -0
  147. edsl/scenarios/FileStore.py +150 -239
  148. edsl/scenarios/PdfExtractor.py +40 -0
  149. edsl/scenarios/Scenario.py +90 -193
  150. edsl/scenarios/ScenarioHtmlMixin.py +4 -3
  151. edsl/scenarios/ScenarioList.py +415 -244
  152. edsl/scenarios/ScenarioListExportMixin.py +0 -7
  153. edsl/scenarios/ScenarioListPdfMixin.py +15 -37
  154. edsl/scenarios/__init__.py +1 -2
  155. edsl/scenarios/directory_scanner.py +96 -0
  156. edsl/scenarios/file_methods.py +85 -0
  157. edsl/scenarios/handlers/__init__.py +13 -0
  158. edsl/scenarios/handlers/csv.py +49 -0
  159. edsl/scenarios/handlers/docx.py +76 -0
  160. edsl/scenarios/handlers/html.py +37 -0
  161. edsl/scenarios/handlers/json.py +111 -0
  162. edsl/scenarios/handlers/latex.py +5 -0
  163. edsl/scenarios/handlers/md.py +51 -0
  164. edsl/scenarios/handlers/pdf.py +68 -0
  165. edsl/scenarios/handlers/png.py +39 -0
  166. edsl/scenarios/handlers/pptx.py +105 -0
  167. edsl/scenarios/handlers/py.py +294 -0
  168. edsl/scenarios/handlers/sql.py +313 -0
  169. edsl/scenarios/handlers/sqlite.py +149 -0
  170. edsl/scenarios/handlers/txt.py +33 -0
  171. edsl/scenarios/{ScenarioJoin.py → scenario_join.py} +10 -6
  172. edsl/scenarios/scenario_selector.py +156 -0
  173. edsl/study/ObjectEntry.py +1 -1
  174. edsl/study/SnapShot.py +1 -1
  175. edsl/study/Study.py +5 -12
  176. edsl/surveys/ConstructDAG.py +92 -0
  177. edsl/surveys/EditSurvey.py +221 -0
  178. edsl/surveys/InstructionHandler.py +100 -0
  179. edsl/surveys/MemoryManagement.py +72 -0
  180. edsl/surveys/Rule.py +5 -4
  181. edsl/surveys/RuleCollection.py +25 -27
  182. edsl/surveys/RuleManager.py +172 -0
  183. edsl/surveys/Simulator.py +75 -0
  184. edsl/surveys/Survey.py +270 -791
  185. edsl/surveys/SurveyCSS.py +20 -8
  186. edsl/surveys/{SurveyFlowVisualizationMixin.py → SurveyFlowVisualization.py} +11 -9
  187. edsl/surveys/SurveyToApp.py +141 -0
  188. edsl/surveys/__init__.py +4 -2
  189. edsl/surveys/descriptors.py +6 -2
  190. edsl/surveys/instructions/ChangeInstruction.py +1 -2
  191. edsl/surveys/instructions/Instruction.py +4 -13
  192. edsl/surveys/instructions/InstructionCollection.py +11 -6
  193. edsl/templates/error_reporting/interview_details.html +1 -1
  194. edsl/templates/error_reporting/report.html +1 -1
  195. edsl/tools/plotting.py +1 -1
  196. edsl/utilities/PrettyList.py +56 -0
  197. edsl/utilities/is_notebook.py +18 -0
  198. edsl/utilities/is_valid_variable_name.py +11 -0
  199. edsl/utilities/remove_edsl_version.py +24 -0
  200. edsl/utilities/utilities.py +35 -23
  201. {edsl-0.1.38.dev4.dist-info → edsl-0.1.39.dist-info}/METADATA +12 -10
  202. edsl-0.1.39.dist-info/RECORD +358 -0
  203. {edsl-0.1.38.dev4.dist-info → edsl-0.1.39.dist-info}/WHEEL +1 -1
  204. edsl/language_models/KeyLookup.py +0 -30
  205. edsl/language_models/registry.py +0 -190
  206. edsl/language_models/unused/ReplicateBase.py +0 -83
  207. edsl/results/ResultsDBMixin.py +0 -238
  208. edsl-0.1.38.dev4.dist-info/RECORD +0 -277
  209. /edsl/questions/{RegisterQuestionsMeta.py → register_questions_meta.py} +0 -0
  210. /edsl/results/{ResultsFetchMixin.py → results_fetch_mixin.py} +0 -0
  211. /edsl/results/{ResultsToolsMixin.py → results_tools_mixin.py} +0 -0
  212. {edsl-0.1.38.dev4.dist-info → edsl-0.1.39.dist-info}/LICENSE +0 -0
@@ -4,111 +4,11 @@ import tempfile
4
4
  import mimetypes
5
5
  import os
6
6
  from typing import Dict, Any, IO, Optional
7
- import requests
8
- from urllib.parse import urlparse
9
7
 
10
- import google.generativeai as genai
8
+ from edsl.scenarios.Scenario import Scenario
9
+ from edsl.utilities.remove_edsl_version import remove_edsl_version
11
10
 
12
- from edsl import Scenario
13
- from edsl.utilities.decorators import add_edsl_version, remove_edsl_version
14
- from edsl.utilities.utilities import is_notebook
15
-
16
-
17
- def view_csv(csv_path):
18
- import pandas as pd
19
-
20
- df = pd.read_csv(csv_path)
21
- return df
22
-
23
-
24
- def view_html(html_path):
25
- import os
26
- import subprocess
27
- from IPython.display import IFrame, display, HTML
28
-
29
- if os.path.exists(html_path):
30
- if is_notebook():
31
- # Display the HTML inline in Jupyter Notebook
32
- display(IFrame(src=html_path, width=700, height=600))
33
- display(
34
- HTML(
35
- f'<a href="{html_path}" target="_blank">Open HTML in a new tab</a>'
36
- )
37
- )
38
- else:
39
- try:
40
- if (os_name := os.name) == "posix":
41
- # Open with the default browser on macOS
42
- subprocess.run(["open", html_path], check=True)
43
- elif os_name == "nt":
44
- # Open with the default browser on Windows
45
- os.startfile(html_path)
46
- else:
47
- # Open with the default browser on Linux
48
- subprocess.run(["xdg-open", html_path], check=True)
49
- except Exception as e:
50
- print(f"Error opening HTML file: {e}")
51
- else:
52
- print("HTML file was not found.")
53
-
54
-
55
- def view_html(html_path):
56
- import os
57
- from IPython.display import display, HTML
58
-
59
- if is_notebook():
60
- with open(html_path, "r") as f:
61
- html_content = f.read()
62
- display(HTML(html_content))
63
- else:
64
- if os.path.exists(html_path):
65
- try:
66
- if (os_name := os.name) == "posix":
67
- subprocess.run(["open", html_path], check=True)
68
- elif os_name == "nt":
69
- os.startfile(html_path)
70
- else:
71
- subprocess.run(["xdg-open", html_path], check=True)
72
- except Exception as e:
73
- print(f"Error opening file: {e}")
74
- else:
75
- print("File was not created successfully.")
76
-
77
-
78
- def view_pdf(pdf_path):
79
- import os
80
- import subprocess
81
- import os
82
- from IPython.display import HTML, display
83
-
84
- if is_notebook():
85
- # Convert to absolute path if needed
86
- with open(pdf_path, "rb") as f:
87
- base64_pdf = base64.b64encode(f.read()).decode("utf-8")
88
-
89
- html = f"""
90
- <iframe
91
- src="data:application/pdf;base64,{base64_pdf}"
92
- width="800px"
93
- height="800px"
94
- type="application/pdf"
95
- ></iframe>
96
- """
97
- display(HTML(html))
98
-
99
- if os.path.exists(pdf_path):
100
- try:
101
- if (os_name := os.name) == "posix":
102
- # for cool kids
103
- subprocess.run(["open", pdf_path], check=True) # macOS
104
- elif os_name == "nt":
105
- os.startfile(pdf_path) # Windows
106
- else:
107
- subprocess.run(["xdg-open", pdf_path], check=True) # Linux
108
- except Exception as e:
109
- print(f"Error opening PDF: {e}")
110
- else:
111
- print("PDF file was not created successfully.")
11
+ from edsl.scenarios.file_methods import FileMethods
112
12
 
113
13
 
114
14
  class FileStore(Scenario):
@@ -122,6 +22,7 @@ class FileStore(Scenario):
122
22
  suffix: Optional[str] = None,
123
23
  base64_string: Optional[str] = None,
124
24
  external_locations: Optional[Dict[str, str]] = None,
25
+ extracted_text: Optional[str] = None,
125
26
  **kwargs,
126
27
  ):
127
28
  if path is None and "filename" in kwargs:
@@ -137,6 +38,11 @@ class FileStore(Scenario):
137
38
  )
138
39
  self.base64_string = base64_string or self.encode_file_to_base64_string(path)
139
40
  self.external_locations = external_locations or {}
41
+
42
+ self.extracted_text = (
43
+ self.extract_text() if extracted_text is None else extracted_text
44
+ )
45
+
140
46
  super().__init__(
141
47
  {
142
48
  "path": path,
@@ -145,6 +51,7 @@ class FileStore(Scenario):
145
51
  "suffix": self.suffix,
146
52
  "mime_type": self.mime_type,
147
53
  "external_locations": self.external_locations,
54
+ "extracted_text": self.extracted_text,
148
55
  }
149
56
  )
150
57
 
@@ -170,88 +77,12 @@ class FileStore(Scenario):
170
77
  return "FileStore: self.path"
171
78
 
172
79
  @classmethod
173
- def example(cls, example_type="text"):
174
- import textwrap
175
- import tempfile
176
-
177
- if example_type == "png" or example_type == "image":
178
- import importlib.resources
179
- from pathlib import Path
180
-
181
- # Get package root directory
182
- package_root = Path(__file__).parent.parent.parent
183
- logo_path = package_root / "static" / "logo.png"
184
- return cls(str(logo_path))
185
-
186
- if example_type == "text":
187
- with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
188
- f.write(b"Hello, World!")
189
-
190
- return cls(path=f.name)
191
-
192
- elif example_type == "csv":
193
- from edsl.results.Results import Results
194
-
195
- r = Results.example()
196
-
197
- with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
198
- r.to_csv(filename=f.name)
199
- return cls(f.name)
200
-
201
- elif example_type == "pdf":
202
- pdf_string = textwrap.dedent(
203
- """\
204
- %PDF-1.4
205
- 1 0 obj
206
- << /Type /Catalog /Pages 2 0 R >>
207
- endobj
208
- 2 0 obj
209
- << /Type /Pages /Kids [3 0 R] /Count 1 >>
210
- endobj
211
- 3 0 obj
212
- << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
213
- endobj
214
- 4 0 obj
215
- << /Length 44 >>
216
- stream
217
- BT
218
- /F1 24 Tf
219
- 100 700 Td
220
- (Hello, World!) Tj
221
- ET
222
- endstream
223
- endobj
224
- 5 0 obj
225
- << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
226
- endobj
227
- 6 0 obj
228
- << /ProcSet [/PDF /Text] /Font << /F1 5 0 R >> >>
229
- endobj
230
- xref
231
- 0 7
232
- 0000000000 65535 f
233
- 0000000010 00000 n
234
- 0000000053 00000 n
235
- 0000000100 00000 n
236
- 0000000173 00000 n
237
- 0000000232 00000 n
238
- 0000000272 00000 n
239
- trailer
240
- << /Size 7 /Root 1 0 R >>
241
- startxref
242
- 318
243
- %%EOF"""
244
- )
245
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
246
- f.write(pdf_string.encode())
247
-
248
- return cls(f.name)
249
-
250
- elif example_type == "html":
251
- with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as f:
252
- f.write("<html><body><h1>Test</h1></body></html>".encode())
253
-
254
- return cls(f.name)
80
def example(cls, example_type="txt"):
    """Build an example instance backed by a sample file of the given type.

    Looks up the handler registered for ``example_type`` through
    ``FileMethods.get_handler``. When a handler is found, a fresh handler
    instance produces a sample via its ``example()`` method and the result
    is passed as the single positional argument to ``cls``.

    Args:
        example_type: Key used to look up the file handler (default "txt").

    Returns:
        A new ``cls`` instance, or ``None`` (after printing a notice) when
        the lookup yields nothing truthy.
    """
    handler_cls = FileMethods.get_handler(example_type)
    if not handler_cls:
        print(f"Example for {example_type} is not supported.")
        return None

    sample = handler_cls().example()
    return cls(sample)
255
86
 
256
87
  @property
257
88
  def size(self) -> int:
@@ -260,6 +91,8 @@ class FileStore(Scenario):
260
91
  return os.path.getsize(self.path)
261
92
 
262
93
  def upload_google(self, refresh: bool = False) -> None:
94
+ import google.generativeai as genai
95
+
263
96
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
264
97
  google_info = genai.upload_file(self.path, mime_type=self.mime_type)
265
98
  self.external_locations["google"] = google_info.to_dict()
@@ -271,7 +104,21 @@ class FileStore(Scenario):
271
104
  return cls(**d)
272
105
 
273
106
  def __repr__(self):
274
- return f"FileStore(path='{self.path}')"
107
+ import reprlib
108
+
109
+ r = reprlib.Repr()
110
+ r.maxstring = 20 # Limit strings to 20 chars
111
+ r.maxother = 30 # Limit other types to 30 chars
112
+
113
+ params = ", ".join(f"{key}={r.repr(value)}" for key, value in self.data.items())
114
+ return f"{self.__class__.__name__}({params})"
115
+
116
def _repr_html_(self):
    """Return the parent's HTML representation followed by a download link.

    The parent class's ``_repr_html_`` output comes first, then a ``<br>``,
    then the markup produced by
    ``ConstructDownloadLink(self).html_create_link(self.path, style=None)``.
    """
    inherited_markup = super()._repr_html_()

    # Imported lazily, exactly where the link is needed.
    from edsl.scenarios.ConstructDownloadLink import ConstructDownloadLink

    link_builder = ConstructDownloadLink(self)
    download_markup = link_builder.html_create_link(self.path, style=None)
    return f"{inherited_markup}<br>{download_markup}"
275
122
 
276
123
  def encode_file_to_base64_string(self, file_path: str):
277
124
  try:
@@ -296,9 +143,44 @@ class FileStore(Scenario):
296
143
 
297
144
  def open(self) -> "IO":
298
145
  if self.binary:
299
- return self.base64_to_file(self["base64_string"], is_binary=True)
146
+ return self.base64_to_file(self.base64_string, is_binary=True)
300
147
  else:
301
- return self.base64_to_text_file(self["base64_string"])
148
+ return self.base64_to_text_file(self.base64_string)
149
+
150
def write(self, filename: Optional[str] = None) -> str:
    """Write the stored file content to disk and return the path written.

    Args:
        filename: Destination path. If None, a uniquely named temporary
            file with this object's suffix is created and used instead.

    Returns:
        The path of the file that was written.

    Raises:
        Exception: Any error raised while decoding or writing the content
            is reported on stdout and then re-raised unchanged.
    """
    # Binary payloads are written as bytes, everything else as text.
    mode = "wb" if self.binary else "w"

    if filename is None:
        from tempfile import NamedTemporaryFile

        # Only reserve a unique name here (delete=False keeps the file);
        # the actual content is written below.
        with NamedTemporaryFile(delete=False, suffix="." + self.suffix) as f:
            filename = f.name

    try:
        # Read the payload before opening the destination so a decoding
        # failure cannot leave a freshly truncated target file behind.
        content = self.open().read()
        # For text mode, make sure a str (not bytes) is written.
        if not self.binary and isinstance(content, bytes):
            content = content.decode("utf-8")
        with open(filename, mode) as f:
            f.write(content)
        print(f"File written to {filename}")
    except Exception as e:
        print(f"Error writing file: {e}")
        raise

    # The original left this return commented out, so callers always got
    # None despite the documented ``-> str`` contract.
    return filename
302
184
 
303
185
  @staticmethod
304
186
  def base64_to_text_file(base64_string) -> "IO":
@@ -327,6 +209,15 @@ class FileStore(Scenario):
327
209
  # Create a StringIO object for text data
328
210
  return io.StringIO(text_data)
329
211
 
212
@property
def text(self):
    """Return the decoded text content, or warn and return None if binary.

    For non-binary files the base64 payload is turned into a text file
    object via ``base64_to_text_file`` and read in full. For binary files
    a warning is issued and nothing is returned.
    """
    if not self.binary:
        text_stream = self.base64_to_text_file(self.base64_string)
        return text_stream.read()

    import warnings

    warnings.warn("This is a binary file.")
    return None
220
+
330
221
  def to_tempfile(self, suffix=None):
331
222
  if suffix is None:
332
223
  suffix = self.suffix
@@ -335,7 +226,7 @@ class FileStore(Scenario):
335
226
  self["base64_string"], is_binary=True
336
227
  )
337
228
  else:
338
- file_like_object = self.base64_to_text_file(self["base64_string"])
229
+ file_like_object = self.base64_to_text_file(self.base64_string)
339
230
 
340
231
  # Create a named temporary file
341
232
  mode = "wb" if self.binary else "w"
@@ -352,40 +243,23 @@ class FileStore(Scenario):
352
243
 
353
244
  return temp_file.name
354
245
 
355
- def view(self, max_size: int = 300) -> None:
356
- # with self.open() as f:
357
- if self.suffix == "csv":
358
- return view_csv(self.path)
359
-
360
- if self.suffix == "pdf":
361
- view_pdf(self.path)
362
-
363
- if self.suffix == "html":
364
- view_html(self.path)
365
-
366
- if self.suffix == "png" or self.suffix == "jpg" or self.suffix == "jpeg":
367
- if is_notebook():
368
- from IPython.display import Image
369
- from PIL import Image as PILImage
370
-
371
- if max_size:
372
- # Open the image using Pillow
373
- with PILImage.open(self.path) as img:
374
- # Get original width and height
375
- original_width, original_height = img.size
246
def view(self) -> None:
    """Display the file using the handler registered for its suffix.

    The handler is looked up with ``FileMethods.get_handler(self.suffix)``
    and, when found, is instantiated with ``self.path`` and asked to
    ``view()``. If the lookup yields nothing truthy, a notice is printed.
    """
    viewer_cls = FileMethods.get_handler(self.suffix)
    if not viewer_cls:
        print(f"Viewing of {self.suffix} files is not supported.")
        return

    viewer = viewer_cls(self.path)
    viewer.view()
376
252
 
377
- # Calculate the scaling factor
378
- scale = min(
379
- max_size / original_width, max_size / original_height
380
- )
253
+ def extract_text(self) -> str:
254
+ handler = FileMethods.get_handler(self.suffix)
255
+ if handler and hasattr(handler, "extract_text"):
256
+ return handler(self.path).extract_text()
381
257
 
382
- # Calculate new dimensions
383
- new_width = int(original_width * scale)
384
- new_height = int(original_height * scale)
258
+ if not self.binary:
259
+ return self.text
385
260
 
386
- return Image(self.path, width=new_width, height=new_height)
387
- else:
388
- return Image(self.path)
261
+ return None
262
+ # raise TypeError("No text method found for this file type.")
389
263
 
390
264
  def push(
391
265
  self, description: Optional[str] = None, visibility: str = "unlisted"
@@ -423,6 +297,8 @@ class FileStore(Scenario):
423
297
  :param download_path: The path to save the downloaded file.
424
298
  :param mime_type: The MIME type of the file. If None, it will be guessed from the file extension.
425
299
  """
300
+ import requests
301
+ from urllib.parse import urlparse
426
302
 
427
303
  response = requests.get(url, stream=True)
428
304
  response.raise_for_status() # Raises an HTTPError for bad responses
@@ -446,6 +322,43 @@ class FileStore(Scenario):
446
322
  # Create and return a new File instance
447
323
  return cls(download_path, mime_type=mime_type)
448
324
 
325
def create_link(self, custom_filename=None, style=None):
    """Create a download link for this file via ``ConstructDownloadLink``.

    Args:
        custom_filename: Forwarded unchanged as the first argument of
            ``ConstructDownloadLink.create_link``.
        style: Forwarded unchanged as the second argument.

    Returns:
        Whatever ``ConstructDownloadLink(self).create_link(...)`` returns.
    """
    # Imported lazily, at the point of use.
    from edsl.scenarios.ConstructDownloadLink import ConstructDownloadLink

    link_builder = ConstructDownloadLink(self)
    return link_builder.create_link(custom_filename, style)
329
+
330
def to_pandas(self):
    """Return the file content as a pandas DataFrame, if the handler allows.

    The handler for ``self.suffix`` is looked up through
    ``FileMethods.get_handler``; when it exposes ``to_pandas``, it is
    instantiated with ``self.path`` and its ``to_pandas()`` result returned.

    Returns:
        pandas.DataFrame: The data from the file as a DataFrame.

    Raises:
        AttributeError: If there is no handler for this suffix or the
            handler does not provide ``to_pandas``.
    """
    handler_cls = FileMethods.get_handler(self.suffix)
    if not handler_cls or not hasattr(handler_cls, "to_pandas"):
        raise AttributeError(
            f"Converting {self.suffix} files to pandas DataFrame is not supported"
        )

    converter = handler_cls(self.path)
    return converter.to_pandas()
346
+
347
def __getattr__(self, name):
    """Delegate unknown attributes to the pandas DataFrame for CSV files.

    Python calls this hook only after normal attribute lookup has failed.
    For a CSV file the content is converted with ``to_pandas()`` and the
    attribute is looked up on the resulting DataFrame.

    Args:
        name: The attribute name that normal lookup could not resolve.

    Returns:
        The DataFrame attribute called ``name`` (CSV files only).

    Raises:
        AttributeError: For dunder names, when ``suffix`` has not been set
            on the instance yet, when the file is not a CSV, or when the
            DataFrame has no such attribute.
    """
    missing = AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )

    # Protocol probes (copy, pickle, ...) ask for dunder names on
    # half-built instances; answering them must never parse a CSV.
    if name.startswith("__") and name.endswith("__"):
        raise missing

    # Read ``suffix`` without re-entering this hook: a plain ``self.suffix``
    # recurses until RecursionError when the attribute is not set yet.
    try:
        suffix = object.__getattribute__(self, "suffix")
    except AttributeError:
        raise missing from None

    if suffix == "csv":
        # Get the pandas DataFrame and check it for the requested attribute.
        df = self.to_pandas()
        if hasattr(df, name):
            return getattr(df, name)

    # Not a CSV, or the DataFrame has no such attribute.
    raise missing
361
+
449
362
 
450
363
  class CSVFileStore(FileStore):
451
364
  @classmethod
@@ -606,27 +519,25 @@ class HTMLFileStore(FileStore):
606
519
 
607
520
 
608
521
  if __name__ == "__main__":
609
- # file_path = "../conjure/examples/Ex11-2.sav"
610
- # fs = FileStore(file_path)
611
- # info = fs.push()
612
- # print(info)
522
+ import doctest
613
523
 
614
- # fs = CSVFileStore.example()
615
- # fs.to_tempfile()
616
- # print(fs.view())
524
+ doctest.testmod()
617
525
 
618
- # fs = PDFFileStore.example()
526
+ # fs = FileStore.example("pdf")
619
527
  # fs.view()
620
528
 
621
- # fs = PDFFileStore("paper.pdf")
622
- # fs.view()
623
- # from edsl import Conjure
624
- pass
625
- # fs = PNGFileStore("logo.png")
626
- # fs.view()
627
- # fs.upload_google()
529
+ formats = FileMethods.supported_file_types()
530
+ for file_type in formats:
531
+ print("Now testinging", file_type)
532
+ fs = FileStore.example(file_type)
533
+ fs.view()
534
+ input("Press Enter to continue...")
535
+
536
+ # pdf_example.view()
537
+ # FileStore(pdf_example).view()
538
+
539
+ # pdf_methods = methods.get("pdf")
540
+ # file = pdf_methods().example()
541
+ # pdf_methods(file).view()
628
542
 
629
- # c = Conjure(datafile_name=fs.to_tempfile())
630
- # f = PDFFileStore("paper.pdf")
631
- # print(f.to_tempfile())
632
- # f.push()
543
+ # print(FileMethods._handlers)
@@ -0,0 +1,40 @@
1
+ import os
2
+
3
+
4
class PdfExtractor:
    """Extract the text of a PDF and wrap it in an object of the parent's class.

    The extracted data is a dict with the keys ``"filename"`` and ``"text"``;
    ``get_object`` passes that dict to the class of ``parent_object``.
    """

    def __init__(self, pdf_path: str, parent_object: object):
        """Remember the PDF path and the class used to build the result.

        Args:
            pdf_path: Path of the PDF file to read.
            parent_object: Any object; only its class is kept, and that
                class is later called with the extracted dict.
        """
        self.pdf_path = pdf_path
        self.constructor = parent_object.__class__

    def get_object(self) -> object:
        """Return ``constructor(pdf_dict)`` built from the extracted PDF data."""
        return self.constructor(self._get_pdf_dict())

    def _get_pdf_dict(self) -> dict:
        """Read the PDF and return ``{"filename": ..., "text": ...}``.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        # Check the path first so a missing file is reported as such even
        # before the PDF library is imported.
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"The file {self.pdf_path} does not exist.")

        import fitz  # imported lazily, only when a PDF is actually read

        # Get the filename from the path
        filename = os.path.basename(self.pdf_path)

        pieces = []
        document = fitz.open(self.pdf_path)
        try:
            for page_num in range(len(document)):
                page = document.load_page(page_num)
                blocks = page.get_text("blocks")  # Extract text blocks

                # Sort by vertical position (y0) first, then x0, to keep
                # reading order.
                blocks.sort(key=lambda b: (b[1], b[0]))

                # Element 4 of each block holds its text.
                pieces.extend(block[4] + "\n" for block in blocks)
        finally:
            # Always release the document handle, even if extraction fails.
            document.close()

        return {"filename": filename, "text": "".join(pieces)}