edsl 0.1.39__py3-none-any.whl → 0.1.39.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/Base.py +116 -197
- edsl/__init__.py +7 -15
- edsl/__version__.py +1 -1
- edsl/agents/Agent.py +147 -351
- edsl/agents/AgentList.py +73 -211
- edsl/agents/Invigilator.py +50 -101
- edsl/agents/InvigilatorBase.py +70 -62
- edsl/agents/PromptConstructor.py +225 -143
- edsl/agents/__init__.py +1 -0
- edsl/agents/prompt_helpers.py +3 -3
- edsl/auto/AutoStudy.py +5 -18
- edsl/auto/StageBase.py +40 -53
- edsl/auto/StageQuestions.py +1 -2
- edsl/auto/utilities.py +6 -0
- edsl/config.py +2 -22
- edsl/conversation/car_buying.py +1 -2
- edsl/coop/PriceFetcher.py +1 -1
- edsl/coop/coop.py +47 -125
- edsl/coop/utils.py +14 -14
- edsl/data/Cache.py +27 -45
- edsl/data/CacheEntry.py +15 -12
- edsl/data/CacheHandler.py +12 -31
- edsl/data/RemoteCacheSync.py +46 -154
- edsl/data/__init__.py +3 -4
- edsl/data_transfer_models.py +1 -2
- edsl/enums.py +0 -27
- edsl/exceptions/__init__.py +50 -50
- edsl/exceptions/agents.py +0 -12
- edsl/exceptions/questions.py +6 -24
- edsl/exceptions/scenarios.py +0 -7
- edsl/inference_services/AnthropicService.py +19 -38
- edsl/inference_services/AwsBedrock.py +2 -0
- edsl/inference_services/AzureAI.py +2 -0
- edsl/inference_services/GoogleService.py +12 -7
- edsl/inference_services/InferenceServiceABC.py +85 -18
- edsl/inference_services/InferenceServicesCollection.py +79 -120
- edsl/inference_services/MistralAIService.py +3 -0
- edsl/inference_services/OpenAIService.py +35 -47
- edsl/inference_services/PerplexityService.py +3 -0
- edsl/inference_services/TestService.py +10 -11
- edsl/inference_services/TogetherAIService.py +3 -5
- edsl/jobs/Answers.py +14 -1
- edsl/jobs/Jobs.py +431 -356
- edsl/jobs/JobsChecks.py +10 -35
- edsl/jobs/JobsPrompts.py +4 -6
- edsl/jobs/JobsRemoteInferenceHandler.py +133 -205
- edsl/jobs/buckets/BucketCollection.py +3 -44
- edsl/jobs/buckets/TokenBucket.py +21 -53
- edsl/jobs/interviews/Interview.py +408 -143
- edsl/jobs/runners/JobsRunnerAsyncio.py +403 -88
- edsl/jobs/runners/JobsRunnerStatus.py +165 -133
- edsl/jobs/tasks/QuestionTaskCreator.py +19 -21
- edsl/jobs/tasks/TaskHistory.py +18 -38
- edsl/jobs/tasks/task_status_enum.py +2 -0
- edsl/language_models/KeyLookup.py +30 -0
- edsl/language_models/LanguageModel.py +236 -194
- edsl/language_models/ModelList.py +19 -28
- edsl/language_models/__init__.py +2 -1
- edsl/language_models/registry.py +190 -0
- edsl/language_models/repair.py +2 -2
- edsl/language_models/unused/ReplicateBase.py +83 -0
- edsl/language_models/utilities.py +4 -5
- edsl/notebooks/Notebook.py +14 -19
- edsl/prompts/Prompt.py +39 -29
- edsl/questions/{answer_validator_mixin.py → AnswerValidatorMixin.py} +2 -47
- edsl/questions/QuestionBase.py +214 -68
- edsl/questions/{question_base_gen_mixin.py → QuestionBaseGenMixin.py} +50 -57
- edsl/questions/QuestionBasePromptsMixin.py +3 -7
- edsl/questions/QuestionBudget.py +1 -1
- edsl/questions/QuestionCheckBox.py +3 -3
- edsl/questions/QuestionExtract.py +7 -5
- edsl/questions/QuestionFreeText.py +3 -2
- edsl/questions/QuestionList.py +18 -10
- edsl/questions/QuestionMultipleChoice.py +23 -67
- edsl/questions/QuestionNumerical.py +4 -2
- edsl/questions/QuestionRank.py +17 -7
- edsl/questions/{response_validator_abc.py → ResponseValidatorABC.py} +26 -40
- edsl/questions/SimpleAskMixin.py +3 -4
- edsl/questions/__init__.py +1 -2
- edsl/questions/derived/QuestionLinearScale.py +3 -6
- edsl/questions/derived/QuestionTopK.py +1 -1
- edsl/questions/descriptors.py +3 -17
- edsl/questions/question_registry.py +1 -1
- edsl/results/CSSParameterizer.py +1 -1
- edsl/results/Dataset.py +7 -170
- edsl/results/DatasetExportMixin.py +305 -168
- edsl/results/DatasetTree.py +8 -28
- edsl/results/Result.py +206 -298
- edsl/results/Results.py +131 -149
- edsl/results/ResultsDBMixin.py +238 -0
- edsl/results/ResultsExportMixin.py +0 -2
- edsl/results/{results_selector.py → Selector.py} +13 -23
- edsl/results/TableDisplay.py +171 -98
- edsl/results/__init__.py +1 -1
- edsl/scenarios/FileStore.py +239 -150
- edsl/scenarios/Scenario.py +193 -90
- edsl/scenarios/ScenarioHtmlMixin.py +3 -4
- edsl/scenarios/{scenario_join.py → ScenarioJoin.py} +6 -10
- edsl/scenarios/ScenarioList.py +244 -415
- edsl/scenarios/ScenarioListExportMixin.py +7 -0
- edsl/scenarios/ScenarioListPdfMixin.py +37 -15
- edsl/scenarios/__init__.py +2 -1
- edsl/study/ObjectEntry.py +1 -1
- edsl/study/SnapShot.py +1 -1
- edsl/study/Study.py +12 -5
- edsl/surveys/Rule.py +4 -5
- edsl/surveys/RuleCollection.py +27 -25
- edsl/surveys/Survey.py +791 -270
- edsl/surveys/SurveyCSS.py +8 -20
- edsl/surveys/{SurveyFlowVisualization.py → SurveyFlowVisualizationMixin.py} +9 -11
- edsl/surveys/__init__.py +2 -4
- edsl/surveys/descriptors.py +2 -6
- edsl/surveys/instructions/ChangeInstruction.py +2 -1
- edsl/surveys/instructions/Instruction.py +13 -4
- edsl/surveys/instructions/InstructionCollection.py +6 -11
- edsl/templates/error_reporting/interview_details.html +1 -1
- edsl/templates/error_reporting/report.html +1 -1
- edsl/tools/plotting.py +1 -1
- edsl/utilities/utilities.py +23 -35
- {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/METADATA +10 -12
- edsl-0.1.39.dev1.dist-info/RECORD +277 -0
- {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/WHEEL +1 -1
- edsl/agents/QuestionInstructionPromptBuilder.py +0 -128
- edsl/agents/QuestionTemplateReplacementsBuilder.py +0 -137
- edsl/agents/question_option_processor.py +0 -172
- edsl/coop/CoopFunctionsMixin.py +0 -15
- edsl/coop/ExpectedParrotKeyHandler.py +0 -125
- edsl/exceptions/inference_services.py +0 -5
- edsl/inference_services/AvailableModelCacheHandler.py +0 -184
- edsl/inference_services/AvailableModelFetcher.py +0 -215
- edsl/inference_services/ServiceAvailability.py +0 -135
- edsl/inference_services/data_structures.py +0 -134
- edsl/jobs/AnswerQuestionFunctionConstructor.py +0 -223
- edsl/jobs/FetchInvigilator.py +0 -47
- edsl/jobs/InterviewTaskManager.py +0 -98
- edsl/jobs/InterviewsConstructor.py +0 -50
- edsl/jobs/JobsComponentConstructor.py +0 -189
- edsl/jobs/JobsRemoteInferenceLogger.py +0 -239
- edsl/jobs/RequestTokenEstimator.py +0 -30
- edsl/jobs/async_interview_runner.py +0 -138
- edsl/jobs/buckets/TokenBucketAPI.py +0 -211
- edsl/jobs/buckets/TokenBucketClient.py +0 -191
- edsl/jobs/check_survey_scenario_compatibility.py +0 -85
- edsl/jobs/data_structures.py +0 -120
- edsl/jobs/decorators.py +0 -35
- edsl/jobs/jobs_status_enums.py +0 -9
- edsl/jobs/loggers/HTMLTableJobLogger.py +0 -304
- edsl/jobs/results_exceptions_handler.py +0 -98
- edsl/language_models/ComputeCost.py +0 -63
- edsl/language_models/PriceManager.py +0 -127
- edsl/language_models/RawResponseHandler.py +0 -106
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/key_management/KeyLookup.py +0 -63
- edsl/language_models/key_management/KeyLookupBuilder.py +0 -273
- edsl/language_models/key_management/KeyLookupCollection.py +0 -38
- edsl/language_models/key_management/__init__.py +0 -0
- edsl/language_models/key_management/models.py +0 -131
- edsl/language_models/model.py +0 -256
- edsl/notebooks/NotebookToLaTeX.py +0 -142
- edsl/questions/ExceptionExplainer.py +0 -77
- edsl/questions/HTMLQuestion.py +0 -103
- edsl/questions/QuestionMatrix.py +0 -265
- edsl/questions/data_structures.py +0 -20
- edsl/questions/loop_processor.py +0 -149
- edsl/questions/response_validator_factory.py +0 -34
- edsl/questions/templates/matrix/__init__.py +0 -1
- edsl/questions/templates/matrix/answering_instructions.jinja +0 -5
- edsl/questions/templates/matrix/question_presentation.jinja +0 -20
- edsl/results/MarkdownToDocx.py +0 -122
- edsl/results/MarkdownToPDF.py +0 -111
- edsl/results/TextEditor.py +0 -50
- edsl/results/file_exports.py +0 -252
- edsl/results/smart_objects.py +0 -96
- edsl/results/table_data_class.py +0 -12
- edsl/results/table_renderers.py +0 -118
- edsl/scenarios/ConstructDownloadLink.py +0 -109
- edsl/scenarios/DocumentChunker.py +0 -102
- edsl/scenarios/DocxScenario.py +0 -16
- edsl/scenarios/PdfExtractor.py +0 -40
- edsl/scenarios/directory_scanner.py +0 -96
- edsl/scenarios/file_methods.py +0 -85
- edsl/scenarios/handlers/__init__.py +0 -13
- edsl/scenarios/handlers/csv.py +0 -49
- edsl/scenarios/handlers/docx.py +0 -76
- edsl/scenarios/handlers/html.py +0 -37
- edsl/scenarios/handlers/json.py +0 -111
- edsl/scenarios/handlers/latex.py +0 -5
- edsl/scenarios/handlers/md.py +0 -51
- edsl/scenarios/handlers/pdf.py +0 -68
- edsl/scenarios/handlers/png.py +0 -39
- edsl/scenarios/handlers/pptx.py +0 -105
- edsl/scenarios/handlers/py.py +0 -294
- edsl/scenarios/handlers/sql.py +0 -313
- edsl/scenarios/handlers/sqlite.py +0 -149
- edsl/scenarios/handlers/txt.py +0 -33
- edsl/scenarios/scenario_selector.py +0 -156
- edsl/surveys/ConstructDAG.py +0 -92
- edsl/surveys/EditSurvey.py +0 -221
- edsl/surveys/InstructionHandler.py +0 -100
- edsl/surveys/MemoryManagement.py +0 -72
- edsl/surveys/RuleManager.py +0 -172
- edsl/surveys/Simulator.py +0 -75
- edsl/surveys/SurveyToApp.py +0 -141
- edsl/utilities/PrettyList.py +0 -56
- edsl/utilities/is_notebook.py +0 -18
- edsl/utilities/is_valid_variable_name.py +0 -11
- edsl/utilities/remove_edsl_version.py +0 -24
- edsl-0.1.39.dist-info/RECORD +0 -358
- /edsl/questions/{register_questions_meta.py → RegisterQuestionsMeta.py} +0 -0
- /edsl/results/{results_fetch_mixin.py → ResultsFetchMixin.py} +0 -0
- /edsl/results/{results_tools_mixin.py → ResultsToolsMixin.py} +0 -0
- {edsl-0.1.39.dist-info → edsl-0.1.39.dev1.dist-info}/LICENSE +0 -0
edsl/results/table_renderers.py
DELETED
@@ -1,118 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
from edsl.results.table_data_class import TableData
|
3
|
-
|
4
|
-
|
5
|
-
class DataTablesRendererABC(ABC):
    """Abstract base for objects that render a ``TableData`` as HTML.

    Concrete renderers read the table from ``self.table_data`` and must
    implement :meth:`render_html`.
    """

    def __init__(self, table_data: TableData):
        """Keep a reference to the table that will be rendered."""
        self.table_data = table_data

    @abstractmethod
    def render_html(self) -> str:
        """Return the HTML representation of ``self.table_data``."""
        ...
|
12
|
-
|
13
|
-
|
14
|
-
class DataTablesRenderer(DataTablesRendererABC):
    """Interactive DataTables renderer implementation.

    Produces a standalone HTML page whose single table is enhanced by the
    jQuery DataTables plugin. All CSS/JS assets are referenced from the
    cdnjs CDN, so the rendered page needs network access to be interactive.
    """

    def render_html(self) -> str:
        """Render ``self.table_data`` as a complete HTML document.

        Reads ``headers``, ``data`` and ``parameters`` from
        ``self.table_data``. If the instance has a ``css_parameterizer``
        attribute, it is applied to the stylesheet with those parameters.

        Returns:
            The HTML document as a string.
        """
        # NOTE(review): the template links the Buttons/Responsive stylesheets
        # and configures a 'colvis' button, but only the core DataTables and
        # Bootstrap-integration scripts are loaded — confirm whether the
        # extension scripts are expected to be present.
        html_template = """
        <!DOCTYPE html>
        <html>
        <head>
            <link href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/css/bootstrap.min.css" rel="stylesheet">
            <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-bs5/1.13.6/dataTables.bootstrap5.min.css" rel="stylesheet">
            <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-buttons-bs5/2.4.1/buttons.bootstrap5.min.css" rel="stylesheet">
            <link href="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-responsive-bs5/2.4.1/responsive.bootstrap5.min.css" rel="stylesheet">
            <style>
                {css}
            </style>
        </head>
        <body>
            <div class="container">
                <table id="interactive-table" class="table table-striped" style="width:100%">
                    <thead>
                        <tr>{header_cells}</tr>
                    </thead>
                    <tbody>{body_rows}</tbody>
                </table>
            </div>
            <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.7.0/jquery.min.js"></script>
            <script src="https://cdnjs.cloudflare.com/ajax/libs/datatables.net/1.13.6/jquery.dataTables.min.js"></script>
            <script src="https://cdnjs.cloudflare.com/ajax/libs/datatables.net-bs5/1.13.6/dataTables.bootstrap5.min.js"></script>
            <script>
                $(document).ready(function() {{
                    $('#interactive-table').DataTable({{
                        pageLength: 10,
                        lengthMenu: [[5, 10, 25, -1], [5, 10, 25, "All"]],
                        scrollX: true,
                        responsive: true,
                        dom: 'Bfrtip',
                        buttons: [
                            {{
                                extend: 'colvis',
                                text: 'Show/Hide Columns'
                            }}
                        ]
                    }});
                }});
            </script>
        </body>
        </html>
        """

        # NOTE(review): header and cell values are interpolated without HTML
        # escaping; left unchanged in case cells intentionally carry markup —
        # confirm whether the table data can contain untrusted text.
        header_cells = "".join(
            f"<th>{header}</th>" for header in self.table_data.headers
        )
        # Build all rows with a single join instead of repeated `+=`.
        body_rows = "".join(
            "<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>"
            for row in self.table_data.data
        )

        parameters = self.table_data.parameters or {}
        css = self.get_css()
        if hasattr(self, "css_parameterizer"):
            css = self.css_parameterizer(css).apply_parameters(parameters)

        return html_template.format(
            css=css, header_cells=header_cells, body_rows=body_rows
        )

    @classmethod
    def get_css(cls) -> str:
        """Load CSS content from the ``table_display.css`` file next to this module."""
        # Imported locally: the module's import block does not bring `Path`
        # into scope, so the bare name raised NameError before this fix.
        from pathlib import Path

        css_path = Path(__file__).parent / "table_display.css"
        return css_path.read_text()
|
86
|
-
|
87
|
-
|
88
|
-
class PandasStyleRenderer(DataTablesRendererABC):
    """Pandas-based styled renderer implementation"""

    def render_html(self) -> str:
        """Render the table through a pandas ``Styler`` inside a scrollable div.

        The DataFrame comes from ``raw_data_set.to_pandas()`` when the table
        data carries a raw data set exposing that method; otherwise it is
        built from ``data`` and ``headers``.
        """
        import io
        from contextlib import redirect_stderr

        import pandas as pd

        # NOTE(review): indentation is not recoverable from the diff this code
        # was taken from; assumed only the DataFrame construction runs with
        # stderr redirected — confirm against the original file.
        discarded_stderr = io.StringIO()
        with redirect_stderr(discarded_stderr):
            raw = self.table_data.raw_data_set
            if raw is None or not hasattr(raw, "to_pandas"):
                frame = pd.DataFrame(
                    self.table_data.data, columns=self.table_data.headers
                )
            else:
                frame = raw.to_pandas()

        left_aligned = frame.style.set_properties(**{"text-align": "left"})
        styled_df = left_aligned.background_gradient()

        return f"""
        <div style="max-height: 500px; overflow-y: auto;">
            {styled_df.to_html()}
        </div>
        """

    @classmethod
    def get_css(cls) -> str:
        """Return an empty stylesheet; the pandas Styler emits its own CSS."""
        return ""
|
@@ -1,109 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import mimetypes
|
3
|
-
|
4
|
-
|
5
|
-
class ConstructDownloadLink:
    """
    A class to create HTML download links for FileStore objects.
    The links can be displayed in Jupyter notebooks or other web interfaces.
    """

    def __init__(self, filestore):
        """
        Initialize with a FileStore object.

        Args:
            filestore: A FileStore object containing the file to be made downloadable.
                This class reads its ``path``, ``base64_string`` and ``mime_type``
                attributes.
        """
        self.filestore = filestore

    def create_link(self, custom_filename=None, style=None):
        """
        Create a download link wrapped for display in a notebook.

        Args:
            custom_filename (str, optional): Custom name for the downloaded file.
            style (dict, optional): Custom CSS styles for the download button.

        Returns:
            IPython.display.HTML: HTML object containing the download link
        """
        from IPython.display import HTML

        return HTML(self.html_create_link(custom_filename, style))

    def html_create_link(self, custom_filename=None, style=None):
        """
        Create the HTML markup of a download link for the file.

        Args:
            custom_filename (str, optional): Custom name for the downloaded file.
                If None, uses original filename.
            style (dict, optional): Custom CSS styles for the download button.
                If None, uses default styling.

        Returns:
            str: an ``<a download=...>`` element embedding the file as a
            base64 ``data:`` URI
        """
        from html import escape

        # Get filename from path or use custom filename
        original_filename = os.path.basename(self.filestore.path)
        filename = custom_filename or original_filename

        # Use the base64 string already stored in FileStore
        b64_data = self.filestore.base64_string

        # Use mime type from FileStore or guess it; without the fallback a
        # missing mime type produced an invalid "data:None;..." URI.
        mime_type = (
            self.filestore.mime_type
            or mimetypes.guess_type(filename)[0]
            or "application/octet-stream"
        )

        # Default style if none provided
        default_style = {
            "background-color": "#4CAF50",
            "color": "white",
            "padding": "10px 20px",
            "text-decoration": "none",
            "border-radius": "4px",
            "display": "inline-block",
            "margin": "10px 0",
            "font-family": "sans-serif",
            "cursor": "pointer",
        }

        button_style = style or default_style
        style_str = "; ".join(f"{k}: {v}" for k, v in button_style.items())

        # Escape values placed inside attributes/text so quotes or angle
        # brackets in a filename or style value cannot break the markup.
        safe_filename = escape(filename, quote=True)
        safe_style = escape(style_str, quote=True)

        html = f"""
        <a download="{safe_filename}"
           href="data:{mime_type};base64,{b64_data}"
           style="{safe_style}">
            Download {safe_filename}
        </a>
        """
        return html

    def create_multiple_links(self, files, custom_filenames=None, style=None):
        """
        Create multiple download links at once.
        Useful when you want to provide different versions of the same file
        or related files together.

        Args:
            files (list): List of FileStore objects
            custom_filenames (list, optional): List of custom filenames for downloads.
                Paired with ``files`` by position; extra entries on either side
                are ignored.
            style (dict, optional): Custom CSS styles for the download buttons

        Returns:
            IPython.display.HTML: HTML object containing all download links
        """
        # `HTML` was previously only imported inside `create_link`, so this
        # method raised NameError when it reached its return statement.
        from IPython.display import HTML

        if custom_filenames is None:
            custom_filenames = [None] * len(files)

        html_parts = [
            ConstructDownloadLink(file_obj).html_create_link(
                custom_filename=custom_name, style=style
            )
            for file_obj, custom_name in zip(files, custom_filenames)
        ]

        return HTML(
            '<div style="display: flex; gap: 10px;">' + "".join(html_parts) + "</div>"
        )
|
104
|
-
|
105
|
-
|
106
|
-
if __name__ == "__main__":
|
107
|
-
import doctest
|
108
|
-
|
109
|
-
doctest.testmod()
|
@@ -1,102 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Optional, Generator, TYPE_CHECKING
|
3
|
-
import copy
|
4
|
-
|
5
|
-
if TYPE_CHECKING:
|
6
|
-
from edsl.scenarios.Scenario import Scenario
|
7
|
-
from edsl.scenarios.ScenarioList import ScenarioList
|
8
|
-
|
9
|
-
|
10
|
-
class DocumentChunker:
    """Split one text field of a scenario into several scenarios, one per chunk."""

    def __init__(self, scenario: "Scenario"):
        """Remember the scenario whose field will be chunked."""
        self.scenario = scenario

    @staticmethod
    def _line_chunks(text, num_lines: int) -> Generator[str, None, None]:
        """Split a text into chunks of a given size.

        :param text: The text to split.
        :param num_lines: The number of lines in each chunk.

        Example:

        >>> list(DocumentChunker._line_chunks("This is a test.\\nThis is a test. This is a test.", 1))
        ['This is a test.', 'This is a test. This is a test.']
        """
        lines = text.split("\n")
        for start in range(0, len(lines), num_lines):
            yield "\n".join(lines[start : start + num_lines])

    @staticmethod
    def _word_chunks(text, num_words: int) -> Generator[str, None, None]:
        """Split a text into chunks of a given size.

        :param text: The text to split.
        :param num_words: The number of words in each chunk.

        Example:

        >>> list(DocumentChunker._word_chunks("This is a test.", 2))
        ['This is', 'a test.']
        """
        words = text.split()
        for start in range(0, len(words), num_words):
            yield " ".join(words[start : start + num_words])

    def chunk(
        self,
        field,
        num_words: Optional[int] = None,
        num_lines: Optional[int] = None,
        include_original=False,
        hash_original=False,
    ) -> "ScenarioList":
        """Split a field into chunks of a given size.

        :param field: The field to split.
        :param num_words: The number of words in each chunk.
        :param num_lines: The number of lines in each chunk.
        :param include_original: Whether to include the original field in the new scenarios.
        :param hash_original: Whether to hash the original field in the new scenarios.
        :raises ValueError: If neither or both of ``num_words`` and ``num_lines`` are given.

        Each returned scenario is a deep copy of the source scenario with
        ``field`` replaced by one chunk and ``<field>_chunk`` set to the
        chunk's index.

        If you specify `include_original=True`, the original field will be included in the new scenarios with an "_original" suffix.
        """
        # Validate before doing any work: previously the text was chunked
        # (twice, when both sizes were given) before the arguments were checked.
        if num_words is None and num_lines is None:
            raise ValueError("You must specify either num_words or num_lines.")

        if num_words is not None and num_lines is not None:
            raise ValueError(
                "You must specify either num_words or num_lines, but not both."
            )

        import hashlib

        from edsl.scenarios.ScenarioList import ScenarioList

        text = self.scenario[field]
        if num_words is not None:
            chunks = self._word_chunks(text, num_words)
        else:
            chunks = self._line_chunks(text, num_lines)

        # The "_original" value is the same for every chunk, so compute it once.
        if include_original:
            original_value = (
                hashlib.md5(text.encode()).hexdigest() if hash_original else text
            )

        scenarios = []
        for i, chunk in enumerate(chunks):
            new_scenario = copy.deepcopy(self.scenario)
            new_scenario[field] = chunk
            new_scenario[field + "_chunk"] = i
            if include_original:
                new_scenario[field + "_original"] = original_value
            scenarios.append(new_scenario)
        return ScenarioList(scenarios)
|
97
|
-
|
98
|
-
|
99
|
-
if __name__ == "__main__":
|
100
|
-
import doctest
|
101
|
-
|
102
|
-
doctest.testmod()
|
edsl/scenarios/DocxScenario.py
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
class DocxScenario:
    """Expose the paragraph text of a .docx file as a scenario-style dict."""

    def __init__(self, docx_path: str):
        """Open the document at ``docx_path`` with python-docx."""
        from docx import Document

        self.doc = Document(docx_path)
        self.docx_path = docx_path

    def get_scenario_dict(self) -> dict:
        """Return the file path together with all paragraph texts joined by newlines."""
        paragraph_texts = [paragraph.text for paragraph in self.doc.paragraphs]
        joined = "\n".join(paragraph_texts)
        return {"file_path": self.docx_path, "text": joined}
|
edsl/scenarios/PdfExtractor.py
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
|
3
|
-
|
4
|
-
class PdfExtractor:
    """Extract the text of a PDF and wrap it in the caller's own class."""

    def __init__(self, pdf_path: str, parent_object: object):
        """Store the PDF path and the class of ``parent_object``.

        The class of ``parent_object`` is later called with the extracted
        dictionary to build the result of :meth:`get_object`.
        """
        self.pdf_path = pdf_path
        self.constructor = parent_object.__class__

    def get_object(self) -> object:
        """Return ``parent_object``'s class instantiated with the extracted PDF dict."""
        return self.constructor(self._get_pdf_dict())

    def _get_pdf_dict(self) -> dict:
        """Read the PDF and return ``{"filename": ..., "text": ...}``.

        Raises:
            FileNotFoundError: If ``self.pdf_path`` does not exist.
        """
        # Ensure the file exists before importing the PDF backend, so a bad
        # path is reported as such even when PyMuPDF is not installed.
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"The file {self.pdf_path} does not exist.")

        import fitz

        # Get the filename from the path
        filename = os.path.basename(self.pdf_path)

        # Open the PDF file; it is closed in `finally` (it was previously leaked).
        document = fitz.open(self.pdf_path)
        pieces = []
        try:
            # Iterate through each page and extract text
            for page_num in range(len(document)):
                page = document.load_page(page_num)
                blocks = page.get_text("blocks")  # Extract text blocks

                # Sort by the second tuple element, then the first
                # (y0 then x0 per the original author) to keep reading order.
                blocks.sort(key=lambda b: (b[1], b[0]))

                # Element 4 of each block is the text; collect the pieces and
                # join once instead of growing a string with `+=`.
                pieces.extend(block[4] + "\n" for block in blocks)
        finally:
            document.close()

        # Create a dictionary for the combined text
        return {"filename": filename, "text": "".join(pieces)}
|
@@ -1,96 +0,0 @@
|
|
1
|
-
# directory_scanner.py
|
2
|
-
from dataclasses import dataclass
|
3
|
-
from typing import Optional, List, Iterator, TypeVar, Generic, Callable, Any
|
4
|
-
import os
|
5
|
-
|
6
|
-
T = TypeVar("T")


@dataclass
class DirectoryScanner:
    """
    Finds files under ``directory_path`` and turns each accepted path into
    an object by calling a caller-supplied factory.
    """

    directory_path: str

    def scan(
        self,
        factory: Callable[[str], T],
        recursive: bool = False,
        suffix_allow_list: Optional[List[str]] = None,
        suffix_exclude_list: Optional[List[str]] = None,
        example_suffix: Optional[str] = None,
        include_no_extension: bool = True,
    ) -> List[T]:
        """
        Eager counterpart of :meth:`iter_scan`: collect every produced object in a list.

        Args:
            factory: Callable that creates objects from file paths
            recursive: If True, recursively traverse subdirectories
            suffix_allow_list: List of allowed file extensions (without dots)
            suffix_exclude_list: List of excluded file extensions (takes precedence over allow list)
            example_suffix: If provided, only include files whose path ends with this suffix
            include_no_extension: Whether to include files without extensions
        """
        filters = {
            "recursive": recursive,
            "suffix_allow_list": suffix_allow_list,
            "suffix_exclude_list": suffix_exclude_list,
            "example_suffix": example_suffix,
            "include_no_extension": include_no_extension,
        }
        return list(self.iter_scan(factory, **filters))

    def iter_scan(
        self,
        factory: Callable[[str], T],
        recursive: bool = False,
        suffix_allow_list: Optional[List[str]] = None,
        suffix_exclude_list: Optional[List[str]] = None,
        example_suffix: Optional[str] = None,
        include_no_extension: bool = True,
    ) -> Iterator[T]:
        """
        Lazily walk the directory, yielding ``factory(path)`` for each accepted file.
        """

        def candidate_paths():
            # Recursive mode yields every file os.walk reports; flat mode
            # lists the directory and keeps regular files only.
            if not recursive:
                for entry in os.listdir(self.directory_path):
                    full_path = os.path.join(self.directory_path, entry)
                    if os.path.isfile(full_path):
                        yield full_path
                return
            for root, _, files in os.walk(self.directory_path):
                for name in files:
                    yield os.path.join(root, name)

        def is_accepted(filepath: str) -> bool:
            # Extension without its leading dot ("" when there is none).
            extension = os.path.splitext(filepath)[1][1:]

            # Extension-less files are governed solely by include_no_extension.
            if not extension:
                return include_no_extension

            # Exclusions are tested first, then the example suffix, then the
            # allow list; any hit rejects the file.
            rejected = (
                (suffix_exclude_list and extension in suffix_exclude_list)
                or (example_suffix and not filepath.endswith(example_suffix))
                or (suffix_allow_list and extension not in suffix_allow_list)
            )
            return not rejected

        for path in candidate_paths():
            if is_accepted(path):
                yield factory(path)
|
edsl/scenarios/file_methods.py
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
from typing import Optional, Dict, Type
|
2
|
-
from abc import ABC, abstractmethod
|
3
|
-
import importlib.metadata
|
4
|
-
import importlib.util
|
5
|
-
|
6
|
-
from edsl.utilities.is_notebook import is_notebook
|
7
|
-
|
8
|
-
|
9
|
-
class FileMethods(ABC):
    """Base class and registry for per-file-type handlers.

    A subclass that defines a ``suffix`` class attribute is registered
    under that suffix as soon as the subclass is created.
    """

    # Registry shared by all handlers: suffix -> handler class.
    _handlers: Dict[str, Type["FileMethods"]] = {}

    def __init__(self, path: Optional[str] = None):
        """Store the path of the file this handler operates on."""
        self.path = path

    def __init_subclass__(cls) -> None:
        """Register subclasses automatically when they're defined."""
        super().__init_subclass__()
        if hasattr(cls, "suffix"):
            FileMethods._handlers[cls.suffix] = cls

    @classmethod
    def get_handler(cls, suffix: str) -> Optional[Type["FileMethods"]]:
        """Get the appropriate handler class for a given suffix.

        The suffix is lower-cased before lookup. Returns None when no
        handler is registered for it.
        """
        # Load plugins if they haven't been loaded yet
        if not cls._handlers:
            cls.load_plugins()
        return cls._handlers.get(suffix.lower())

    @staticmethod
    def _iter_external_entry_points():
        """Return the entry points registered in the ``file_handlers`` group."""
        try:
            return importlib.metadata.entry_points(group="file_handlers")
        except TypeError:
            # Python < 3.10: entry_points() accepts no keyword filters and
            # returns a mapping of group name -> entry points. The previous
            # fallback used an empty list, silently dropping all external
            # plugins on those versions.
            return importlib.metadata.entry_points().get("file_handlers", [])

    @classmethod
    def load_plugins(cls):
        """Load all file handler plugins including built-ins and external plugins."""
        # Imported for its side effect: defining the built-in handler
        # classes registers them via __init_subclass__.
        from edsl.scenarios import handlers  # noqa: F401

        # Then load any external plugins
        for ep in cls._iter_external_entry_points():
            try:
                # Registration happens automatically via __init_subclass__
                ep.load()
            except Exception as e:
                # Best effort: one broken plugin must not block the others.
                print(f"Failed to load external handler {ep.name}: {e}")

    @classmethod
    def get_handler_for_path(cls, path: str) -> Optional[Type["FileMethods"]]:
        """Get the appropriate handler class for a file path."""
        # The suffix is whatever follows the last "." in the path, lower-cased.
        suffix = path.split(".")[-1].lower() if "." in path else ""
        return cls.get_handler(suffix)

    @classmethod
    def create(cls, path: str) -> Optional["FileMethods"]:
        """Create an appropriate handler instance for the given path.

        Returns None when no handler is registered for the path's suffix.
        """
        handler_class = cls.get_handler_for_path(path)
        if handler_class:
            return handler_class(path)
        return None

    @classmethod
    def supported_file_types(cls):
        """Return the list of suffixes that currently have a registered handler."""
        if not cls._handlers:
            cls.load_plugins()
        return list(cls._handlers.keys())

    @abstractmethod
    def view_system(self):
        """Show the file outside a notebook (called by :meth:`view`)."""
        ...

    @abstractmethod
    def view_notebook(self):
        """Show the file inside a notebook (called by :meth:`view`)."""
        ...

    def view(self):
        """Dispatch to ``view_notebook`` or ``view_system`` depending on ``is_notebook()``."""
        if is_notebook():
            self.view_notebook()
        else:
            self.view_system()

    @abstractmethod
    def example(self):
        """Provide an example for this file type; the contract is defined by subclasses."""
        ...
|
@@ -1,13 +0,0 @@
|
|
1
|
-
from .pdf import PdfMethods
|
2
|
-
from .docx import DocxMethods
|
3
|
-
from .png import PngMethods
|
4
|
-
from .txt import TxtMethods
|
5
|
-
from .html import HtmlMethods
|
6
|
-
from .md import MarkdownMethods
|
7
|
-
from .csv import CsvMethods
|
8
|
-
from .json import JsonMethods
|
9
|
-
from .sql import SqlMethods
|
10
|
-
from .pptx import PptxMethods
|
11
|
-
from .latex import LaTeXMethods
|
12
|
-
from .py import PyMethods
|
13
|
-
from .sqlite import SQLiteMethods
|
edsl/scenarios/handlers/csv.py
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
import tempfile
|
2
|
-
from edsl.scenarios.file_methods import FileMethods
|
3
|
-
|
4
|
-
|
5
|
-
class CsvMethods(FileMethods):
    """File handler for CSV files, registered under the ``csv`` suffix."""

    suffix = "csv"

    def view_system(self):
        """Open the CSV with the operating system's default application.

        Prints a message instead of raising when the file is missing or
        the opener fails.
        """
        import os
        import subprocess
        import sys

        if not os.path.exists(self.path):
            print("CSV file was not found.")
            return

        try:
            # os.name is "posix" on both macOS and Linux, so the previous
            # check sent Linux to the macOS-only `open` command and never
            # reached xdg-open. sys.platform tells the two apart.
            if sys.platform == "darwin":
                subprocess.run(["open", self.path], check=True)  # macOS
            elif os.name == "nt":
                os.startfile(self.path)  # Windows
            else:
                subprocess.run(["xdg-open", self.path], check=True)  # Linux
        except Exception as e:
            print(f"Error opening CSV: {e}")

    def view_notebook(self):
        """Read the CSV into a DataFrame and display it in the notebook."""
        import pandas as pd
        from IPython.display import display

        df = pd.read_csv(self.path)
        display(df)

    def example(self):
        """Write a small two-column CSV to a temporary file and return its path.

        The file is created with ``delete=False``; removing it is left to
        the caller.
        """
        import pandas as pd

        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        # Reserve the name and close the handle before pandas writes to it:
        # writing to a still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
            csv_path = f.name
        df.to_csv(csv_path, index=False)
        return csv_path

    def to_pandas(self):
        """
        Convert the CSV file to a pandas DataFrame.

        Returns:
            pandas.DataFrame: The data from the CSV as a DataFrame
        """
        import pandas as pd

        return pd.read_csv(self.path)
|