edsl 0.1.39.dev1__py3-none-any.whl → 0.1.39.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/Base.py +169 -116
- edsl/__init__.py +14 -6
- edsl/__version__.py +1 -1
- edsl/agents/Agent.py +358 -146
- edsl/agents/AgentList.py +211 -73
- edsl/agents/Invigilator.py +88 -36
- edsl/agents/InvigilatorBase.py +59 -70
- edsl/agents/PromptConstructor.py +117 -219
- edsl/agents/QuestionInstructionPromptBuilder.py +128 -0
- edsl/agents/QuestionOptionProcessor.py +172 -0
- edsl/agents/QuestionTemplateReplacementsBuilder.py +137 -0
- edsl/agents/__init__.py +0 -1
- edsl/agents/prompt_helpers.py +3 -3
- edsl/config.py +22 -2
- edsl/conversation/car_buying.py +2 -1
- edsl/coop/CoopFunctionsMixin.py +15 -0
- edsl/coop/ExpectedParrotKeyHandler.py +125 -0
- edsl/coop/PriceFetcher.py +1 -1
- edsl/coop/coop.py +104 -42
- edsl/coop/utils.py +14 -14
- edsl/data/Cache.py +21 -14
- edsl/data/CacheEntry.py +12 -15
- edsl/data/CacheHandler.py +33 -12
- edsl/data/__init__.py +4 -3
- edsl/data_transfer_models.py +2 -1
- edsl/enums.py +20 -0
- edsl/exceptions/__init__.py +50 -50
- edsl/exceptions/agents.py +12 -0
- edsl/exceptions/inference_services.py +5 -0
- edsl/exceptions/questions.py +24 -6
- edsl/exceptions/scenarios.py +7 -0
- edsl/inference_services/AnthropicService.py +0 -3
- edsl/inference_services/AvailableModelCacheHandler.py +184 -0
- edsl/inference_services/AvailableModelFetcher.py +209 -0
- edsl/inference_services/AwsBedrock.py +0 -2
- edsl/inference_services/AzureAI.py +0 -2
- edsl/inference_services/GoogleService.py +2 -11
- edsl/inference_services/InferenceServiceABC.py +18 -85
- edsl/inference_services/InferenceServicesCollection.py +105 -80
- edsl/inference_services/MistralAIService.py +0 -3
- edsl/inference_services/OpenAIService.py +1 -4
- edsl/inference_services/PerplexityService.py +0 -3
- edsl/inference_services/ServiceAvailability.py +135 -0
- edsl/inference_services/TestService.py +11 -8
- edsl/inference_services/data_structures.py +62 -0
- edsl/jobs/AnswerQuestionFunctionConstructor.py +188 -0
- edsl/jobs/Answers.py +1 -14
- edsl/jobs/FetchInvigilator.py +40 -0
- edsl/jobs/InterviewTaskManager.py +98 -0
- edsl/jobs/InterviewsConstructor.py +48 -0
- edsl/jobs/Jobs.py +102 -243
- edsl/jobs/JobsChecks.py +35 -10
- edsl/jobs/JobsComponentConstructor.py +189 -0
- edsl/jobs/JobsPrompts.py +5 -3
- edsl/jobs/JobsRemoteInferenceHandler.py +128 -80
- edsl/jobs/JobsRemoteInferenceLogger.py +239 -0
- edsl/jobs/RequestTokenEstimator.py +30 -0
- edsl/jobs/buckets/BucketCollection.py +44 -3
- edsl/jobs/buckets/TokenBucket.py +53 -21
- edsl/jobs/buckets/TokenBucketAPI.py +211 -0
- edsl/jobs/buckets/TokenBucketClient.py +191 -0
- edsl/jobs/decorators.py +35 -0
- edsl/jobs/interviews/Interview.py +77 -380
- edsl/jobs/jobs_status_enums.py +9 -0
- edsl/jobs/loggers/HTMLTableJobLogger.py +304 -0
- edsl/jobs/runners/JobsRunnerAsyncio.py +4 -49
- edsl/jobs/tasks/QuestionTaskCreator.py +21 -19
- edsl/jobs/tasks/TaskHistory.py +14 -15
- edsl/jobs/tasks/task_status_enum.py +0 -2
- edsl/language_models/ComputeCost.py +63 -0
- edsl/language_models/LanguageModel.py +137 -234
- edsl/language_models/ModelList.py +11 -13
- edsl/language_models/PriceManager.py +127 -0
- edsl/language_models/RawResponseHandler.py +106 -0
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/__init__.py +0 -1
- edsl/language_models/key_management/KeyLookup.py +63 -0
- edsl/language_models/key_management/KeyLookupBuilder.py +273 -0
- edsl/language_models/key_management/KeyLookupCollection.py +38 -0
- edsl/language_models/key_management/__init__.py +0 -0
- edsl/language_models/key_management/models.py +131 -0
- edsl/language_models/registry.py +49 -59
- edsl/language_models/repair.py +2 -2
- edsl/language_models/utilities.py +5 -4
- edsl/notebooks/Notebook.py +19 -14
- edsl/notebooks/NotebookToLaTeX.py +142 -0
- edsl/prompts/Prompt.py +29 -39
- edsl/questions/AnswerValidatorMixin.py +47 -2
- edsl/questions/ExceptionExplainer.py +77 -0
- edsl/questions/HTMLQuestion.py +103 -0
- edsl/questions/LoopProcessor.py +149 -0
- edsl/questions/QuestionBase.py +37 -192
- edsl/questions/QuestionBaseGenMixin.py +52 -48
- edsl/questions/QuestionBasePromptsMixin.py +7 -3
- edsl/questions/QuestionCheckBox.py +1 -1
- edsl/questions/QuestionExtract.py +1 -1
- edsl/questions/QuestionFreeText.py +1 -2
- edsl/questions/QuestionList.py +3 -5
- edsl/questions/QuestionMatrix.py +265 -0
- edsl/questions/QuestionMultipleChoice.py +66 -22
- edsl/questions/QuestionNumerical.py +1 -3
- edsl/questions/QuestionRank.py +6 -16
- edsl/questions/ResponseValidatorABC.py +37 -11
- edsl/questions/ResponseValidatorFactory.py +28 -0
- edsl/questions/SimpleAskMixin.py +4 -3
- edsl/questions/__init__.py +1 -0
- edsl/questions/derived/QuestionLinearScale.py +6 -3
- edsl/questions/derived/QuestionTopK.py +1 -1
- edsl/questions/descriptors.py +17 -3
- edsl/questions/question_registry.py +1 -1
- edsl/questions/templates/matrix/__init__.py +1 -0
- edsl/questions/templates/matrix/answering_instructions.jinja +5 -0
- edsl/questions/templates/matrix/question_presentation.jinja +20 -0
- edsl/results/CSSParameterizer.py +1 -1
- edsl/results/Dataset.py +170 -7
- edsl/results/DatasetExportMixin.py +224 -302
- edsl/results/DatasetTree.py +28 -8
- edsl/results/MarkdownToDocx.py +122 -0
- edsl/results/MarkdownToPDF.py +111 -0
- edsl/results/Result.py +192 -206
- edsl/results/Results.py +120 -113
- edsl/results/ResultsExportMixin.py +2 -0
- edsl/results/Selector.py +23 -13
- edsl/results/TableDisplay.py +98 -171
- edsl/results/TextEditor.py +50 -0
- edsl/results/__init__.py +1 -1
- edsl/results/smart_objects.py +96 -0
- edsl/results/table_data_class.py +12 -0
- edsl/results/table_renderers.py +118 -0
- edsl/scenarios/ConstructDownloadLink.py +109 -0
- edsl/scenarios/DirectoryScanner.py +96 -0
- edsl/scenarios/DocumentChunker.py +102 -0
- edsl/scenarios/DocxScenario.py +16 -0
- edsl/scenarios/FileStore.py +118 -239
- edsl/scenarios/PdfExtractor.py +40 -0
- edsl/scenarios/Scenario.py +90 -193
- edsl/scenarios/ScenarioHtmlMixin.py +4 -3
- edsl/scenarios/ScenarioJoin.py +10 -6
- edsl/scenarios/ScenarioList.py +383 -240
- edsl/scenarios/ScenarioListExportMixin.py +0 -7
- edsl/scenarios/ScenarioListPdfMixin.py +15 -37
- edsl/scenarios/ScenarioSelector.py +156 -0
- edsl/scenarios/__init__.py +1 -2
- edsl/scenarios/file_methods.py +85 -0
- edsl/scenarios/handlers/__init__.py +13 -0
- edsl/scenarios/handlers/csv.py +38 -0
- edsl/scenarios/handlers/docx.py +76 -0
- edsl/scenarios/handlers/html.py +37 -0
- edsl/scenarios/handlers/json.py +111 -0
- edsl/scenarios/handlers/latex.py +5 -0
- edsl/scenarios/handlers/md.py +51 -0
- edsl/scenarios/handlers/pdf.py +68 -0
- edsl/scenarios/handlers/png.py +39 -0
- edsl/scenarios/handlers/pptx.py +105 -0
- edsl/scenarios/handlers/py.py +294 -0
- edsl/scenarios/handlers/sql.py +313 -0
- edsl/scenarios/handlers/sqlite.py +149 -0
- edsl/scenarios/handlers/txt.py +33 -0
- edsl/study/ObjectEntry.py +1 -1
- edsl/study/SnapShot.py +1 -1
- edsl/study/Study.py +5 -12
- edsl/surveys/ConstructDAG.py +92 -0
- edsl/surveys/EditSurvey.py +221 -0
- edsl/surveys/InstructionHandler.py +100 -0
- edsl/surveys/MemoryManagement.py +72 -0
- edsl/surveys/Rule.py +5 -4
- edsl/surveys/RuleCollection.py +25 -27
- edsl/surveys/RuleManager.py +172 -0
- edsl/surveys/Simulator.py +75 -0
- edsl/surveys/Survey.py +199 -771
- edsl/surveys/SurveyCSS.py +20 -8
- edsl/surveys/{SurveyFlowVisualizationMixin.py → SurveyFlowVisualization.py} +11 -9
- edsl/surveys/SurveyToApp.py +141 -0
- edsl/surveys/__init__.py +4 -2
- edsl/surveys/descriptors.py +6 -2
- edsl/surveys/instructions/ChangeInstruction.py +1 -2
- edsl/surveys/instructions/Instruction.py +4 -13
- edsl/surveys/instructions/InstructionCollection.py +11 -6
- edsl/templates/error_reporting/interview_details.html +1 -1
- edsl/templates/error_reporting/report.html +1 -1
- edsl/tools/plotting.py +1 -1
- edsl/utilities/PrettyList.py +56 -0
- edsl/utilities/is_notebook.py +18 -0
- edsl/utilities/is_valid_variable_name.py +11 -0
- edsl/utilities/remove_edsl_version.py +24 -0
- edsl/utilities/utilities.py +35 -23
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/METADATA +12 -10
- edsl-0.1.39.dev2.dist-info/RECORD +352 -0
- edsl/language_models/KeyLookup.py +0 -30
- edsl/language_models/unused/ReplicateBase.py +0 -83
- edsl/results/ResultsDBMixin.py +0 -238
- edsl-0.1.39.dev1.dist-info/RECORD +0 -277
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/LICENSE +0 -0
- {edsl-0.1.39.dev1.dist-info → edsl-0.1.39.dev2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,96 @@
|
|
1
|
+
# directory_scanner.py
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from typing import Optional, List, Iterator, TypeVar, Generic, Callable, Any
|
4
|
+
import os
|
5
|
+
|
6
|
+
T = TypeVar("T")
|
7
|
+
|
8
|
+
|
9
|
+
@dataclass
class DirectoryScanner:
    """
    Scanner for finding files in a directory based on various criteria.

    Candidate paths are filtered by extension (allow / exclude lists), by an
    optional trailing "example" suffix, and by whether they have an extension
    at all. Every accepted path is handed to a caller-supplied factory.
    """

    # Directory whose entries are scanned.
    directory_path: str

    @staticmethod
    def _normalize_suffixes(suffixes) -> tuple:
        """Return the suffixes as a tuple with one leading dot removed from each.

        ``os.path.splitext`` extensions are compared without their dot, so
        ``".txt"`` and ``"txt"`` must be treated as the same entry. A bare
        string is treated as a single suffix rather than iterated by character.
        """
        if not suffixes:
            return ()
        if isinstance(suffixes, str):
            suffixes = [suffixes]
        return tuple(
            s[1:] if isinstance(s, str) and s.startswith(".") else s
            for s in suffixes
        )

    def scan(
        self,
        factory: Callable[[str], T],
        recursive: bool = False,
        suffix_allow_list: Optional[List[str]] = None,
        suffix_exclude_list: Optional[List[str]] = None,
        example_suffix: Optional[str] = None,
        include_no_extension: bool = True,
    ) -> List[T]:
        """
        Eagerly scan directory and return list of objects created by factory.

        Args:
            factory: Callable that creates objects from file paths
            recursive: If True, recursively traverse subdirectories
            suffix_allow_list: List of allowed file extensions (with or without a leading dot)
            suffix_exclude_list: List of excluded file extensions (takes precedence over allow list)
            example_suffix: If provided, only include files whose path ends with this suffix
            include_no_extension: Whether to include files without extensions

        Returns:
            The objects produced by ``factory``, one per accepted file path.
        """
        return list(
            self.iter_scan(
                factory,
                recursive=recursive,
                suffix_allow_list=suffix_allow_list,
                suffix_exclude_list=suffix_exclude_list,
                example_suffix=example_suffix,
                include_no_extension=include_no_extension,
            )
        )

    def iter_scan(
        self,
        factory: Callable[[str], T],
        recursive: bool = False,
        suffix_allow_list: Optional[List[str]] = None,
        suffix_exclude_list: Optional[List[str]] = None,
        example_suffix: Optional[str] = None,
        include_no_extension: bool = True,
    ) -> Iterator[T]:
        """
        Lazily scan directory and yield objects created by factory.

        Takes the same arguments as ``scan``. Filter precedence for a path:

        1. A file without an extension is decided solely by
           ``include_no_extension``; the other filters are not applied to it.
        2. An extension in ``suffix_exclude_list`` rejects the file.
        3. If ``example_suffix`` is set, the path must end with it.
        4. If ``suffix_allow_list`` is non-empty, the extension must be in it.
        """
        # Normalize once per scan so ".txt" and "txt" behave identically.
        allowed = self._normalize_suffixes(suffix_allow_list)
        excluded = self._normalize_suffixes(suffix_exclude_list)

        def should_include_file(filepath: str) -> bool:
            # splitext returns ".ext" (or ""); drop the dot for comparison.
            ext = os.path.splitext(filepath)[1][1:]

            # Handle no extension case
            if not ext:
                return include_no_extension

            # Check exclusions first (they take precedence)
            if ext in excluded:
                return False

            # Check example suffix if specified
            if example_suffix and not filepath.endswith(example_suffix):
                return False

            # Check allowed suffixes if specified
            if allowed and ext not in allowed:
                return False

            return True

        def iter_files() -> Iterator[str]:
            if recursive:
                # NOTE: os.walk ignores directory-listing errors by default,
                # while os.listdir below raises on a missing directory.
                for root, _, files in os.walk(self.directory_path):
                    for file in files:
                        yield os.path.join(root, file)
            else:
                for file in os.listdir(self.directory_path):
                    file_path = os.path.join(self.directory_path, file)
                    # Skip sub-directories in the non-recursive case.
                    if os.path.isfile(file_path):
                        yield file_path

        for file_path in iter_files():
            if should_include_file(file_path):
                yield factory(file_path)
|
@@ -0,0 +1,102 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Optional, Generator, TYPE_CHECKING
|
3
|
+
import copy
|
4
|
+
|
5
|
+
if TYPE_CHECKING:
|
6
|
+
from edsl.scenarios.Scenario import Scenario
|
7
|
+
from edsl.scenarios.ScenarioList import ScenarioList
|
8
|
+
|
9
|
+
|
10
|
+
class DocumentChunker:
    """Split one text field of a scenario into a list of chunked scenarios."""

    def __init__(self, scenario: "Scenario"):
        """Store the scenario whose field will later be chunked."""
        self.scenario = scenario

    @staticmethod
    def _line_chunks(text, num_lines: int) -> Generator[str, None, None]:
        """Split a text into chunks of a given number of lines.

        :param text: The text to split.
        :param num_lines: The number of lines in each chunk.

        Example:

        >>> list(DocumentChunker._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
        ['This is a test.', 'This is a test. This is a test.']
        """
        lines = text.split("\n")
        for start in range(0, len(lines), num_lines):
            yield "\n".join(lines[start : start + num_lines])

    @staticmethod
    def _word_chunks(text, num_words: int) -> Generator[str, None, None]:
        """Split a text into chunks of a given number of words.

        Words are whitespace-separated; each chunk is re-joined with single spaces.

        :param text: The text to split.
        :param num_words: The number of words in each chunk.

        Example:

        >>> list(DocumentChunker._word_chunks("This is a test.", 2))
        ['This is', 'a test.']
        """
        words = text.split()
        for start in range(0, len(words), num_words):
            yield " ".join(words[start : start + num_words])

    def chunk(
        self,
        field,
        num_words: Optional[int] = None,
        num_lines: Optional[int] = None,
        include_original=False,
        hash_original=False,
    ) -> ScenarioList:
        """Split a field into chunks of a given size.

        :param field: The field to split.
        :param num_words: The number of words in each chunk.
        :param num_lines: The number of lines in each chunk.
        :param include_original: Whether to include the original field in the new scenarios.
        :param hash_original: Whether to hash the original field in the new scenarios.
        :raises ValueError: If neither or both of num_words and num_lines are given.

        Each new scenario is a deep copy of the original with `field` replaced by
        one chunk and `<field>_chunk` set to the chunk's index.

        If you specify `include_original=True`, the original field will be included in the new scenarios with an "_original" suffix.
        With `hash_original=True` the md5 hex digest of the original text is stored instead of the text.
        """
        # Validate before importing or chunking, so bad calls fail without doing any work.
        if num_words is None and num_lines is None:
            raise ValueError("You must specify either num_words or num_lines.")

        if num_words is not None and num_lines is not None:
            raise ValueError(
                "You must specify either num_words or num_lines, but not both."
            )

        from edsl.scenarios.ScenarioList import ScenarioList
        import hashlib

        text = self.scenario[field]
        if num_words is not None:
            chunks = list(self._word_chunks(text, num_words))
        else:
            chunks = list(self._line_chunks(text, num_lines))

        # The stored original (or its hash) is the same for every chunk; compute it once.
        original_value = None
        if include_original:
            if hash_original:
                original_value = hashlib.md5(text.encode()).hexdigest()
            else:
                original_value = text

        scenarios = []
        for i, piece in enumerate(chunks):
            new_scenario = copy.deepcopy(self.scenario)
            new_scenario[field] = piece
            new_scenario[field + "_chunk"] = i
            if include_original:
                new_scenario[field + "_original"] = original_value
            scenarios.append(new_scenario)
        return ScenarioList(scenarios)
|
97
|
+
|
98
|
+
|
99
|
+
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module's docstrings.
    import doctest

    doctest.testmod()
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class DocxScenario:
    """Wrap a ``docx`` document and expose its paragraph text as a plain dict."""

    def __init__(self, docx_path: str):
        """Load the document found at ``docx_path`` and remember the path."""
        # Imported lazily inside the constructor rather than at module level.
        from docx import Document

        self.doc = Document(docx_path)
        self.docx_path = docx_path

    def get_scenario_dict(self) -> dict:
        """Return the source path plus the text of all paragraphs joined by newlines."""
        paragraph_texts = [paragraph.text for paragraph in self.doc.paragraphs]
        return {
            "file_path": self.docx_path,
            "text": "\n".join(paragraph_texts),
        }
|
edsl/scenarios/FileStore.py
CHANGED
@@ -4,111 +4,11 @@ import tempfile
|
|
4
4
|
import mimetypes
|
5
5
|
import os
|
6
6
|
from typing import Dict, Any, IO, Optional
|
7
|
-
import requests
|
8
|
-
from urllib.parse import urlparse
|
9
7
|
|
10
|
-
|
8
|
+
from edsl.scenarios.Scenario import Scenario
|
9
|
+
from edsl.utilities.remove_edsl_version import remove_edsl_version
|
11
10
|
|
12
|
-
from edsl import
|
13
|
-
from edsl.utilities.decorators import add_edsl_version, remove_edsl_version
|
14
|
-
from edsl.utilities.utilities import is_notebook
|
15
|
-
|
16
|
-
|
17
|
-
def view_csv(csv_path):
|
18
|
-
import pandas as pd
|
19
|
-
|
20
|
-
df = pd.read_csv(csv_path)
|
21
|
-
return df
|
22
|
-
|
23
|
-
|
24
|
-
def view_html(html_path):
|
25
|
-
import os
|
26
|
-
import subprocess
|
27
|
-
from IPython.display import IFrame, display, HTML
|
28
|
-
|
29
|
-
if os.path.exists(html_path):
|
30
|
-
if is_notebook():
|
31
|
-
# Display the HTML inline in Jupyter Notebook
|
32
|
-
display(IFrame(src=html_path, width=700, height=600))
|
33
|
-
display(
|
34
|
-
HTML(
|
35
|
-
f'<a href="{html_path}" target="_blank">Open HTML in a new tab</a>'
|
36
|
-
)
|
37
|
-
)
|
38
|
-
else:
|
39
|
-
try:
|
40
|
-
if (os_name := os.name) == "posix":
|
41
|
-
# Open with the default browser on macOS
|
42
|
-
subprocess.run(["open", html_path], check=True)
|
43
|
-
elif os_name == "nt":
|
44
|
-
# Open with the default browser on Windows
|
45
|
-
os.startfile(html_path)
|
46
|
-
else:
|
47
|
-
# Open with the default browser on Linux
|
48
|
-
subprocess.run(["xdg-open", html_path], check=True)
|
49
|
-
except Exception as e:
|
50
|
-
print(f"Error opening HTML file: {e}")
|
51
|
-
else:
|
52
|
-
print("HTML file was not found.")
|
53
|
-
|
54
|
-
|
55
|
-
def view_html(html_path):
|
56
|
-
import os
|
57
|
-
from IPython.display import display, HTML
|
58
|
-
|
59
|
-
if is_notebook():
|
60
|
-
with open(html_path, "r") as f:
|
61
|
-
html_content = f.read()
|
62
|
-
display(HTML(html_content))
|
63
|
-
else:
|
64
|
-
if os.path.exists(html_path):
|
65
|
-
try:
|
66
|
-
if (os_name := os.name) == "posix":
|
67
|
-
subprocess.run(["open", html_path], check=True)
|
68
|
-
elif os_name == "nt":
|
69
|
-
os.startfile(html_path)
|
70
|
-
else:
|
71
|
-
subprocess.run(["xdg-open", html_path], check=True)
|
72
|
-
except Exception as e:
|
73
|
-
print(f"Error opening file: {e}")
|
74
|
-
else:
|
75
|
-
print("File was not created successfully.")
|
76
|
-
|
77
|
-
|
78
|
-
def view_pdf(pdf_path):
|
79
|
-
import os
|
80
|
-
import subprocess
|
81
|
-
import os
|
82
|
-
from IPython.display import HTML, display
|
83
|
-
|
84
|
-
if is_notebook():
|
85
|
-
# Convert to absolute path if needed
|
86
|
-
with open(pdf_path, "rb") as f:
|
87
|
-
base64_pdf = base64.b64encode(f.read()).decode("utf-8")
|
88
|
-
|
89
|
-
html = f"""
|
90
|
-
<iframe
|
91
|
-
src="data:application/pdf;base64,{base64_pdf}"
|
92
|
-
width="800px"
|
93
|
-
height="800px"
|
94
|
-
type="application/pdf"
|
95
|
-
></iframe>
|
96
|
-
"""
|
97
|
-
display(HTML(html))
|
98
|
-
|
99
|
-
if os.path.exists(pdf_path):
|
100
|
-
try:
|
101
|
-
if (os_name := os.name) == "posix":
|
102
|
-
# for cool kids
|
103
|
-
subprocess.run(["open", pdf_path], check=True) # macOS
|
104
|
-
elif os_name == "nt":
|
105
|
-
os.startfile(pdf_path) # Windows
|
106
|
-
else:
|
107
|
-
subprocess.run(["xdg-open", pdf_path], check=True) # Linux
|
108
|
-
except Exception as e:
|
109
|
-
print(f"Error opening PDF: {e}")
|
110
|
-
else:
|
111
|
-
print("PDF file was not created successfully.")
|
11
|
+
from edsl.scenarios.file_methods import FileMethods
|
112
12
|
|
113
13
|
|
114
14
|
class FileStore(Scenario):
|
@@ -122,6 +22,7 @@ class FileStore(Scenario):
|
|
122
22
|
suffix: Optional[str] = None,
|
123
23
|
base64_string: Optional[str] = None,
|
124
24
|
external_locations: Optional[Dict[str, str]] = None,
|
25
|
+
extracted_text: Optional[str] = None,
|
125
26
|
**kwargs,
|
126
27
|
):
|
127
28
|
if path is None and "filename" in kwargs:
|
@@ -137,6 +38,11 @@ class FileStore(Scenario):
|
|
137
38
|
)
|
138
39
|
self.base64_string = base64_string or self.encode_file_to_base64_string(path)
|
139
40
|
self.external_locations = external_locations or {}
|
41
|
+
|
42
|
+
self.extracted_text = (
|
43
|
+
self.extract_text() if extracted_text is None else extracted_text
|
44
|
+
)
|
45
|
+
|
140
46
|
super().__init__(
|
141
47
|
{
|
142
48
|
"path": path,
|
@@ -145,6 +51,7 @@ class FileStore(Scenario):
|
|
145
51
|
"suffix": self.suffix,
|
146
52
|
"mime_type": self.mime_type,
|
147
53
|
"external_locations": self.external_locations,
|
54
|
+
"extracted_text": self.extracted_text,
|
148
55
|
}
|
149
56
|
)
|
150
57
|
|
@@ -170,88 +77,12 @@ class FileStore(Scenario):
|
|
170
77
|
return "FileStore: self.path"
|
171
78
|
|
172
79
|
@classmethod
|
173
|
-
def example(cls, example_type="
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
from pathlib import Path
|
180
|
-
|
181
|
-
# Get package root directory
|
182
|
-
package_root = Path(__file__).parent.parent.parent
|
183
|
-
logo_path = package_root / "static" / "logo.png"
|
184
|
-
return cls(str(logo_path))
|
185
|
-
|
186
|
-
if example_type == "text":
|
187
|
-
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
|
188
|
-
f.write(b"Hello, World!")
|
189
|
-
|
190
|
-
return cls(path=f.name)
|
191
|
-
|
192
|
-
elif example_type == "csv":
|
193
|
-
from edsl.results.Results import Results
|
194
|
-
|
195
|
-
r = Results.example()
|
196
|
-
|
197
|
-
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
|
198
|
-
r.to_csv(filename=f.name)
|
199
|
-
return cls(f.name)
|
200
|
-
|
201
|
-
elif example_type == "pdf":
|
202
|
-
pdf_string = textwrap.dedent(
|
203
|
-
"""\
|
204
|
-
%PDF-1.4
|
205
|
-
1 0 obj
|
206
|
-
<< /Type /Catalog /Pages 2 0 R >>
|
207
|
-
endobj
|
208
|
-
2 0 obj
|
209
|
-
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
210
|
-
endobj
|
211
|
-
3 0 obj
|
212
|
-
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
213
|
-
endobj
|
214
|
-
4 0 obj
|
215
|
-
<< /Length 44 >>
|
216
|
-
stream
|
217
|
-
BT
|
218
|
-
/F1 24 Tf
|
219
|
-
100 700 Td
|
220
|
-
(Hello, World!) Tj
|
221
|
-
ET
|
222
|
-
endstream
|
223
|
-
endobj
|
224
|
-
5 0 obj
|
225
|
-
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
226
|
-
endobj
|
227
|
-
6 0 obj
|
228
|
-
<< /ProcSet [/PDF /Text] /Font << /F1 5 0 R >> >>
|
229
|
-
endobj
|
230
|
-
xref
|
231
|
-
0 7
|
232
|
-
0000000000 65535 f
|
233
|
-
0000000010 00000 n
|
234
|
-
0000000053 00000 n
|
235
|
-
0000000100 00000 n
|
236
|
-
0000000173 00000 n
|
237
|
-
0000000232 00000 n
|
238
|
-
0000000272 00000 n
|
239
|
-
trailer
|
240
|
-
<< /Size 7 /Root 1 0 R >>
|
241
|
-
startxref
|
242
|
-
318
|
243
|
-
%%EOF"""
|
244
|
-
)
|
245
|
-
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
|
246
|
-
f.write(pdf_string.encode())
|
247
|
-
|
248
|
-
return cls(f.name)
|
249
|
-
|
250
|
-
elif example_type == "html":
|
251
|
-
with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as f:
|
252
|
-
f.write("<html><body><h1>Test</h1></body></html>".encode())
|
253
|
-
|
254
|
-
return cls(f.name)
|
80
|
+
def example(cls, example_type="txt"):
|
81
|
+
file_methods_class = FileMethods.get_handler(example_type)
|
82
|
+
if file_methods_class:
|
83
|
+
return cls(file_methods_class().example())
|
84
|
+
else:
|
85
|
+
print(f"Example for {example_type} is not supported.")
|
255
86
|
|
256
87
|
@property
|
257
88
|
def size(self) -> int:
|
@@ -260,6 +91,8 @@ class FileStore(Scenario):
|
|
260
91
|
return os.path.getsize(self.path)
|
261
92
|
|
262
93
|
def upload_google(self, refresh: bool = False) -> None:
|
94
|
+
import google.generativeai as genai
|
95
|
+
|
263
96
|
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
264
97
|
google_info = genai.upload_file(self.path, mime_type=self.mime_type)
|
265
98
|
self.external_locations["google"] = google_info.to_dict()
|
@@ -271,7 +104,21 @@ class FileStore(Scenario):
|
|
271
104
|
return cls(**d)
|
272
105
|
|
273
106
|
def __repr__(self):
|
274
|
-
|
107
|
+
import reprlib
|
108
|
+
|
109
|
+
r = reprlib.Repr()
|
110
|
+
r.maxstring = 20 # Limit strings to 20 chars
|
111
|
+
r.maxother = 30 # Limit other types to 30 chars
|
112
|
+
|
113
|
+
params = ", ".join(f"{key}={r.repr(value)}" for key, value in self.data.items())
|
114
|
+
return f"{self.__class__.__name__}({params})"
|
115
|
+
|
116
|
+
def _repr_html_(self):
|
117
|
+
parent_html = super()._repr_html_()
|
118
|
+
from edsl.scenarios.ConstructDownloadLink import ConstructDownloadLink
|
119
|
+
|
120
|
+
link = ConstructDownloadLink(self).html_create_link(self.path, style=None)
|
121
|
+
return f"{parent_html}<br>{link}"
|
275
122
|
|
276
123
|
def encode_file_to_base64_string(self, file_path: str):
|
277
124
|
try:
|
@@ -296,9 +143,44 @@ class FileStore(Scenario):
|
|
296
143
|
|
297
144
|
def open(self) -> "IO":
|
298
145
|
if self.binary:
|
299
|
-
return self.base64_to_file(self
|
146
|
+
return self.base64_to_file(self.base64_string, is_binary=True)
|
300
147
|
else:
|
301
|
-
return self.base64_to_text_file(self
|
148
|
+
return self.base64_to_text_file(self.base64_string)
|
149
|
+
|
150
|
+
def write(self, filename: Optional[str] = None) -> str:
|
151
|
+
"""
|
152
|
+
Write the file content to disk, either to a specified filename or a temporary file.
|
153
|
+
|
154
|
+
Args:
|
155
|
+
filename (Optional[str]): The destination filename. If None, creates a temporary file.
|
156
|
+
|
157
|
+
Returns:
|
158
|
+
str: The path to the written file.
|
159
|
+
"""
|
160
|
+
# Determine the mode based on binary flag
|
161
|
+
mode = "wb" if self.binary else "w"
|
162
|
+
|
163
|
+
# If no filename provided, create a temporary file
|
164
|
+
if filename is None:
|
165
|
+
from tempfile import NamedTemporaryFile
|
166
|
+
|
167
|
+
with NamedTemporaryFile(delete=False, suffix="." + self.suffix) as f:
|
168
|
+
filename = f.name
|
169
|
+
|
170
|
+
# Write the content using the appropriate mode
|
171
|
+
try:
|
172
|
+
with open(filename, mode) as f:
|
173
|
+
content = self.open().read()
|
174
|
+
# For text mode, ensure we're writing a string
|
175
|
+
if not self.binary and isinstance(content, bytes):
|
176
|
+
content = content.decode("utf-8")
|
177
|
+
f.write(content)
|
178
|
+
print(f"File written to {filename}")
|
179
|
+
except Exception as e:
|
180
|
+
print(f"Error writing file: {e}")
|
181
|
+
raise
|
182
|
+
|
183
|
+
# return filename
|
302
184
|
|
303
185
|
@staticmethod
|
304
186
|
def base64_to_text_file(base64_string) -> "IO":
|
@@ -327,6 +209,15 @@ class FileStore(Scenario):
|
|
327
209
|
# Create a StringIO object for text data
|
328
210
|
return io.StringIO(text_data)
|
329
211
|
|
212
|
+
@property
|
213
|
+
def text(self):
|
214
|
+
if self.binary:
|
215
|
+
import warnings
|
216
|
+
|
217
|
+
warnings.warn("This is a binary file.")
|
218
|
+
else:
|
219
|
+
return self.base64_to_text_file(self.base64_string).read()
|
220
|
+
|
330
221
|
def to_tempfile(self, suffix=None):
|
331
222
|
if suffix is None:
|
332
223
|
suffix = self.suffix
|
@@ -335,7 +226,7 @@ class FileStore(Scenario):
|
|
335
226
|
self["base64_string"], is_binary=True
|
336
227
|
)
|
337
228
|
else:
|
338
|
-
file_like_object = self.base64_to_text_file(self
|
229
|
+
file_like_object = self.base64_to_text_file(self.base64_string)
|
339
230
|
|
340
231
|
# Create a named temporary file
|
341
232
|
mode = "wb" if self.binary else "w"
|
@@ -352,40 +243,23 @@ class FileStore(Scenario):
|
|
352
243
|
|
353
244
|
return temp_file.name
|
354
245
|
|
355
|
-
def view(self
|
356
|
-
|
357
|
-
if
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
view_pdf(self.path)
|
362
|
-
|
363
|
-
if self.suffix == "html":
|
364
|
-
view_html(self.path)
|
365
|
-
|
366
|
-
if self.suffix == "png" or self.suffix == "jpg" or self.suffix == "jpeg":
|
367
|
-
if is_notebook():
|
368
|
-
from IPython.display import Image
|
369
|
-
from PIL import Image as PILImage
|
370
|
-
|
371
|
-
if max_size:
|
372
|
-
# Open the image using Pillow
|
373
|
-
with PILImage.open(self.path) as img:
|
374
|
-
# Get original width and height
|
375
|
-
original_width, original_height = img.size
|
246
|
+
def view(self) -> None:
|
247
|
+
handler = FileMethods.get_handler(self.suffix)
|
248
|
+
if handler:
|
249
|
+
handler(self.path).view()
|
250
|
+
else:
|
251
|
+
print(f"Viewing of {self.suffix} files is not supported.")
|
376
252
|
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
253
|
+
def extract_text(self) -> str:
|
254
|
+
handler = FileMethods.get_handler(self.suffix)
|
255
|
+
if handler and hasattr(handler, "extract_text"):
|
256
|
+
return handler(self.path).extract_text()
|
381
257
|
|
382
|
-
|
383
|
-
|
384
|
-
new_height = int(original_height * scale)
|
258
|
+
if not self.binary:
|
259
|
+
return self.text
|
385
260
|
|
386
|
-
|
387
|
-
|
388
|
-
return Image(self.path)
|
261
|
+
return None
|
262
|
+
# raise TypeError("No text method found for this file type.")
|
389
263
|
|
390
264
|
def push(
|
391
265
|
self, description: Optional[str] = None, visibility: str = "unlisted"
|
@@ -423,6 +297,8 @@ class FileStore(Scenario):
|
|
423
297
|
:param download_path: The path to save the downloaded file.
|
424
298
|
:param mime_type: The MIME type of the file. If None, it will be guessed from the file extension.
|
425
299
|
"""
|
300
|
+
import requests
|
301
|
+
from urllib.parse import urlparse
|
426
302
|
|
427
303
|
response = requests.get(url, stream=True)
|
428
304
|
response.raise_for_status() # Raises an HTTPError for bad responses
|
@@ -446,6 +322,11 @@ class FileStore(Scenario):
|
|
446
322
|
# Create and return a new File instance
|
447
323
|
return cls(download_path, mime_type=mime_type)
|
448
324
|
|
325
|
+
def create_link(self, custom_filename=None, style=None):
|
326
|
+
from edsl.scenarios.ConstructDownloadLink import ConstructDownloadLink
|
327
|
+
|
328
|
+
return ConstructDownloadLink(self).create_link(custom_filename, style)
|
329
|
+
|
449
330
|
|
450
331
|
class CSVFileStore(FileStore):
|
451
332
|
@classmethod
|
@@ -606,27 +487,25 @@ class HTMLFileStore(FileStore):
|
|
606
487
|
|
607
488
|
|
608
489
|
if __name__ == "__main__":
|
609
|
-
|
610
|
-
# fs = FileStore(file_path)
|
611
|
-
# info = fs.push()
|
612
|
-
# print(info)
|
490
|
+
import doctest
|
613
491
|
|
614
|
-
|
615
|
-
# fs.to_tempfile()
|
616
|
-
# print(fs.view())
|
492
|
+
doctest.testmod()
|
617
493
|
|
618
|
-
# fs =
|
494
|
+
# fs = FileStore.example("pdf")
|
619
495
|
# fs.view()
|
620
496
|
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
497
|
+
formats = FileMethods.supported_file_types()
|
498
|
+
for file_type in formats:
|
499
|
+
print("Now testinging", file_type)
|
500
|
+
fs = FileStore.example(file_type)
|
501
|
+
fs.view()
|
502
|
+
input("Press Enter to continue...")
|
503
|
+
|
504
|
+
# pdf_example.view()
|
505
|
+
# FileStore(pdf_example).view()
|
506
|
+
|
507
|
+
# pdf_methods = methods.get("pdf")
|
508
|
+
# file = pdf_methods().example()
|
509
|
+
# pdf_methods(file).view()
|
628
510
|
|
629
|
-
#
|
630
|
-
# f = PDFFileStore("paper.pdf")
|
631
|
-
# print(f.to_tempfile())
|
632
|
-
# f.push()
|
511
|
+
# print(FileMethods._handlers)
|