ara-cli 0.1.9.69__py3-none-any.whl → 0.1.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ara-cli might be problematic. Click here for more details.
- ara_cli/__init__.py +18 -2
- ara_cli/__main__.py +248 -62
- ara_cli/ara_command_action.py +155 -86
- ara_cli/ara_config.py +226 -80
- ara_cli/ara_subcommands/__init__.py +0 -0
- ara_cli/ara_subcommands/autofix.py +26 -0
- ara_cli/ara_subcommands/chat.py +27 -0
- ara_cli/ara_subcommands/classifier_directory.py +16 -0
- ara_cli/ara_subcommands/common.py +100 -0
- ara_cli/ara_subcommands/create.py +75 -0
- ara_cli/ara_subcommands/delete.py +22 -0
- ara_cli/ara_subcommands/extract.py +22 -0
- ara_cli/ara_subcommands/fetch_templates.py +14 -0
- ara_cli/ara_subcommands/list.py +65 -0
- ara_cli/ara_subcommands/list_tags.py +25 -0
- ara_cli/ara_subcommands/load.py +48 -0
- ara_cli/ara_subcommands/prompt.py +136 -0
- ara_cli/ara_subcommands/read.py +47 -0
- ara_cli/ara_subcommands/read_status.py +20 -0
- ara_cli/ara_subcommands/read_user.py +20 -0
- ara_cli/ara_subcommands/reconnect.py +27 -0
- ara_cli/ara_subcommands/rename.py +22 -0
- ara_cli/ara_subcommands/scan.py +14 -0
- ara_cli/ara_subcommands/set_status.py +22 -0
- ara_cli/ara_subcommands/set_user.py +22 -0
- ara_cli/ara_subcommands/template.py +16 -0
- ara_cli/artefact_autofix.py +649 -68
- ara_cli/artefact_creator.py +8 -11
- ara_cli/artefact_deleter.py +2 -4
- ara_cli/artefact_fuzzy_search.py +22 -10
- ara_cli/artefact_link_updater.py +4 -4
- ara_cli/artefact_lister.py +29 -55
- ara_cli/artefact_models/artefact_data_retrieval.py +23 -0
- ara_cli/artefact_models/artefact_load.py +11 -3
- ara_cli/artefact_models/artefact_model.py +146 -39
- ara_cli/artefact_models/artefact_templates.py +70 -44
- ara_cli/artefact_models/businessgoal_artefact_model.py +23 -25
- ara_cli/artefact_models/epic_artefact_model.py +34 -26
- ara_cli/artefact_models/feature_artefact_model.py +203 -64
- ara_cli/artefact_models/keyfeature_artefact_model.py +21 -24
- ara_cli/artefact_models/serialize_helper.py +1 -1
- ara_cli/artefact_models/task_artefact_model.py +83 -15
- ara_cli/artefact_models/userstory_artefact_model.py +37 -27
- ara_cli/artefact_models/vision_artefact_model.py +23 -42
- ara_cli/artefact_reader.py +92 -91
- ara_cli/artefact_renamer.py +8 -4
- ara_cli/artefact_scan.py +66 -3
- ara_cli/chat.py +622 -162
- ara_cli/chat_agent/__init__.py +0 -0
- ara_cli/chat_agent/agent_communicator.py +62 -0
- ara_cli/chat_agent/agent_process_manager.py +211 -0
- ara_cli/chat_agent/agent_status_manager.py +73 -0
- ara_cli/chat_agent/agent_workspace_manager.py +76 -0
- ara_cli/commands/__init__.py +0 -0
- ara_cli/commands/command.py +7 -0
- ara_cli/commands/extract_command.py +15 -0
- ara_cli/commands/load_command.py +65 -0
- ara_cli/commands/load_image_command.py +34 -0
- ara_cli/commands/read_command.py +117 -0
- ara_cli/completers.py +144 -0
- ara_cli/directory_navigator.py +37 -4
- ara_cli/error_handler.py +134 -0
- ara_cli/file_classifier.py +6 -5
- ara_cli/file_lister.py +1 -1
- ara_cli/file_loaders/__init__.py +0 -0
- ara_cli/file_loaders/binary_file_loader.py +33 -0
- ara_cli/file_loaders/document_file_loader.py +34 -0
- ara_cli/file_loaders/document_reader.py +245 -0
- ara_cli/file_loaders/document_readers.py +233 -0
- ara_cli/file_loaders/file_loader.py +50 -0
- ara_cli/file_loaders/file_loaders.py +123 -0
- ara_cli/file_loaders/image_processor.py +89 -0
- ara_cli/file_loaders/markdown_reader.py +75 -0
- ara_cli/file_loaders/text_file_loader.py +187 -0
- ara_cli/global_file_lister.py +51 -0
- ara_cli/list_filter.py +1 -1
- ara_cli/output_suppressor.py +1 -1
- ara_cli/prompt_extractor.py +215 -88
- ara_cli/prompt_handler.py +521 -134
- ara_cli/prompt_rag.py +2 -2
- ara_cli/tag_extractor.py +83 -38
- ara_cli/template_loader.py +245 -0
- ara_cli/template_manager.py +18 -13
- ara_cli/templates/prompt-modules/commands/empty.commands.md +2 -12
- ara_cli/templates/prompt-modules/commands/extract_general.commands.md +12 -0
- ara_cli/templates/prompt-modules/commands/extract_markdown.commands.md +11 -0
- ara_cli/templates/prompt-modules/commands/extract_python.commands.md +13 -0
- ara_cli/templates/prompt-modules/commands/feature_add_or_modifiy_specified_behavior.commands.md +36 -0
- ara_cli/templates/prompt-modules/commands/feature_generate_initial_specified_bevahior.commands.md +53 -0
- ara_cli/templates/prompt-modules/commands/prompt_template_tech_stack_transformer.commands.md +95 -0
- ara_cli/templates/prompt-modules/commands/python_bug_fixing_code.commands.md +34 -0
- ara_cli/templates/prompt-modules/commands/python_generate_code.commands.md +27 -0
- ara_cli/templates/prompt-modules/commands/python_refactoring_code.commands.md +39 -0
- ara_cli/templates/prompt-modules/commands/python_step_definitions_generation_and_fixing.commands.md +40 -0
- ara_cli/templates/prompt-modules/commands/python_unittest_generation_and_fixing.commands.md +48 -0
- ara_cli/update_config_prompt.py +9 -3
- ara_cli/version.py +1 -1
- ara_cli-0.1.10.8.dist-info/METADATA +241 -0
- ara_cli-0.1.10.8.dist-info/RECORD +193 -0
- tests/test_ara_command_action.py +73 -59
- tests/test_ara_config.py +341 -36
- tests/test_artefact_autofix.py +1060 -0
- tests/test_artefact_link_updater.py +3 -3
- tests/test_artefact_lister.py +52 -132
- tests/test_artefact_renamer.py +2 -2
- tests/test_artefact_scan.py +327 -33
- tests/test_chat.py +2063 -498
- tests/test_file_classifier.py +24 -1
- tests/test_file_creator.py +3 -5
- tests/test_file_lister.py +1 -1
- tests/test_global_file_lister.py +131 -0
- tests/test_list_filter.py +2 -2
- tests/test_prompt_handler.py +746 -0
- tests/test_tag_extractor.py +19 -13
- tests/test_template_loader.py +192 -0
- tests/test_template_manager.py +5 -4
- tests/test_update_config_prompt.py +2 -2
- ara_cli/ara_command_parser.py +0 -327
- ara_cli/templates/prompt-modules/blueprints/complete_pytest_unittest.blueprint.md +0 -27
- ara_cli/templates/prompt-modules/blueprints/task_todo_list_implement_feature_BDD_way.blueprint.md +0 -30
- ara_cli/templates/prompt-modules/commands/artefact_classification.commands.md +0 -9
- ara_cli/templates/prompt-modules/commands/artefact_extension.commands.md +0 -17
- ara_cli/templates/prompt-modules/commands/artefact_formulation.commands.md +0 -14
- ara_cli/templates/prompt-modules/commands/behave_step_generation.commands.md +0 -102
- ara_cli/templates/prompt-modules/commands/code_generation_complex.commands.md +0 -20
- ara_cli/templates/prompt-modules/commands/code_generation_simple.commands.md +0 -13
- ara_cli/templates/prompt-modules/commands/error_fixing.commands.md +0 -20
- ara_cli/templates/prompt-modules/commands/feature_file_update.commands.md +0 -18
- ara_cli/templates/prompt-modules/commands/feature_formulation.commands.md +0 -43
- ara_cli/templates/prompt-modules/commands/js_code_generation_simple.commands.md +0 -13
- ara_cli/templates/prompt-modules/commands/refactoring.commands.md +0 -15
- ara_cli/templates/prompt-modules/commands/refactoring_analysis.commands.md +0 -9
- ara_cli/templates/prompt-modules/commands/reverse_engineer_feature_file.commands.md +0 -15
- ara_cli/templates/prompt-modules/commands/reverse_engineer_program_flow.commands.md +0 -19
- ara_cli/templates/template.businessgoal +0 -10
- ara_cli/templates/template.capability +0 -10
- ara_cli/templates/template.epic +0 -15
- ara_cli/templates/template.example +0 -6
- ara_cli/templates/template.feature +0 -26
- ara_cli/templates/template.issue +0 -14
- ara_cli/templates/template.keyfeature +0 -15
- ara_cli/templates/template.task +0 -6
- ara_cli/templates/template.userstory +0 -17
- ara_cli/templates/template.vision +0 -14
- ara_cli-0.1.9.69.dist-info/METADATA +0 -16
- ara_cli-0.1.9.69.dist-info/RECORD +0 -158
- tests/test_ara_autofix.py +0 -219
- {ara_cli-0.1.9.69.dist-info → ara_cli-0.1.10.8.dist-info}/WHEEL +0 -0
- {ara_cli-0.1.9.69.dist-info → ara_cli-0.1.10.8.dist-info}/entry_points.txt +0 -0
- {ara_cli-0.1.9.69.dist-info → ara_cli-0.1.10.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import Tuple, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DocumentReader(ABC):
    """Abstract base class for document readers.

    Stores the document path and the directory that contains it, and offers
    two helpers shared by concrete readers: creating the per-document image
    directory and saving one extracted image together with its description.
    """

    def __init__(self, file_path: str):
        """Remember the document path and its parent directory."""
        self.file_path = file_path
        self.base_dir = os.path.dirname(file_path)

    @abstractmethod
    def read(self, extract_images: bool = False) -> str:
        """Read document and optionally extract images"""
        pass

    def create_image_data_dir(self, extension_suffix: str) -> str:
        """
        Create data directory for images with file extension suffix to avoid conflicts.

        The directory is ``<base_dir>/<stem>_<extension_suffix>.data/images``,
        where ``<stem>`` is the document file name without its extension.

        Returns:
            str: Path to images directory
        """
        stem = os.path.splitext(os.path.basename(self.file_path))[0]
        data_dir = os.path.join(self.base_dir, f"{stem}_{extension_suffix}.data")
        images_dir = os.path.join(data_dir, "images")
        # exist_ok=True replaces the former exists()-then-makedirs() pair, which
        # could fail if the directory appeared between the check and the call.
        os.makedirs(images_dir, exist_ok=True)
        return images_dir

    def save_and_describe_image(
        self,
        image_data: bytes,
        image_format: str,
        save_dir: str,
        image_counter: int
    ) -> Tuple[str, str]:
        """
        Save image data and get its description from LLM.

        The image is written to ``<save_dir>/<image_counter>.<image_format>``.

        Returns:
            tuple: (relative_image_path, description), where the path is
            relative to ``self.base_dir``
        """
        # Imported at call time rather than at module import.
        from ara_cli.prompt_handler import describe_image

        # Save image
        image_filename = f"{image_counter}.{image_format}"
        image_path = os.path.join(save_dir, image_filename)

        with open(image_path, "wb") as image_file:
            image_file.write(image_data)

        # Get image description from LLM
        description = describe_image(image_path)

        # Get relative path
        relative_image_path = os.path.relpath(image_path, self.base_dir)

        return relative_image_path, description
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class DocxReader(DocumentReader):
    """Reader for DOCX files"""

    def read(self, extract_images: bool = False) -> str:
        """Return the paragraph text of the document.

        With ``extract_images`` set, every relationship of the main document
        part whose type contains "image" is saved to the image directory,
        described, and listed under an "Extracted Images" heading appended
        to the text.
        """
        import docx

        document = docx.Document(self.file_path)
        body_text = '\n'.join([paragraph.text for paragraph in document.paragraphs])

        if not extract_images:
            return body_text

        from PIL import Image
        import io

        target_dir = self.create_image_data_dir("docx")

        # Lazily walk the image relationships in document order.
        image_parts = (
            relation.target_part
            for relation in document.part.rels.values()
            if "image" in relation.reltype
        )

        sections = []
        for number, part in enumerate(image_parts, start=1):
            blob = part.blob

            # Pillow reports the format; it becomes the file extension.
            extension = Image.open(io.BytesIO(blob)).format.lower()

            rel_path, caption = self.save_and_describe_image(
                blob, extension, target_dir, number
            )
            sections.append(f"\nImage: {rel_path}\n[{caption}]\n")

        if not sections:
            return body_text

        return body_text + "\n\n### Extracted Images\n" + "\n".join(sections)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class PdfReader(DocumentReader):
    """Reader for PDF files"""

    def read(self, extract_images: bool = False) -> str:
        """Convert the PDF to markdown text.

        With ``extract_images`` set, every image listed on every page is
        extracted, saved to the image directory, described, and listed under
        an "Extracted Images" heading appended to the text.

        Returns:
            str: The markdown text, optionally followed by image descriptions
        """
        import pymupdf4llm

        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import fitz  # PyMuPDF

        # Create images directory
        images_dir = self.create_image_data_dir("pdf")

        # Extract text without images first
        text_content = pymupdf4llm.to_markdown(
            self.file_path, write_images=False)

        # Extract and process images
        image_descriptions = []
        image_counter = 1

        doc = fitz.open(self.file_path)
        try:
            for page in doc:
                for img in page.get_images():
                    # First entry of each image record is its xref
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Save and describe image
                    relative_path, description = self.save_and_describe_image(
                        image_bytes, image_ext, images_dir, image_counter
                    )

                    # Add formatted description to list
                    image_descriptions.append(
                        f"\nImage: {relative_path}\n[{description}]\n")

                    image_counter += 1
        finally:
            # Release the document even when saving or describing fails.
            doc.close()

        # Combine text content with image descriptions
        if image_descriptions:
            text_content += "\n\n### Extracted Images\n" + \
                "\n".join(image_descriptions)

        return text_content
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class OdtReader(DocumentReader):
    """Reader for ODT files"""

    def read(self, extract_images: bool = False) -> str:
        """Convert the ODT file to markdown text.

        With ``extract_images`` set, the file is additionally opened as a zip
        archive and every member under ``Pictures/`` is saved, described and
        listed under an "Extracted Images" heading. Any error during image
        extraction is reported as a warning and stops further extraction;
        images described before the error are still included.
        """
        import pymupdf4llm

        # NOTE(review): confirm that pymupdf4llm accepts ODT input.
        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import zipfile
        from PIL import Image
        import io

        target_dir = self.create_image_data_dir("odt")
        markdown = pymupdf4llm.to_markdown(self.file_path, write_images=False)

        sections = []
        try:
            with zipfile.ZipFile(self.file_path, 'r') as archive:
                members = [
                    name for name in archive.namelist()
                    if name.startswith('Pictures/')
                ]

                # The first failure leaves the loop, so numbering stays gapless.
                for number, member in enumerate(members, start=1):
                    payload = archive.read(member)

                    # Pillow reports the format; it becomes the file extension.
                    extension = Image.open(io.BytesIO(payload)).format.lower()

                    rel_path, caption = self.save_and_describe_image(
                        payload, extension, target_dir, number
                    )
                    sections.append(f"\nImage: {rel_path}\n[{caption}]\n")
        except Exception as e:
            print(f"Warning: Could not extract images from ODT: {e}")

        if not sections:
            return markdown

        return markdown + "\n\n### Extracted Images\n" + "\n".join(sections)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class DocumentReaderFactory:
    """Factory for creating appropriate document readers"""

    @staticmethod
    def create_reader(file_path: str) -> Optional[DocumentReader]:
        """Pick a reader class from the lower-cased file extension.

        Returns:
            A reader instance for ``.docx``, ``.pdf`` or ``.odt`` paths,
            otherwise ``None``.
        """
        suffix = os.path.splitext(file_path)[1].lower()

        registry = {
            '.docx': DocxReader,
            '.pdf': PdfReader,
            '.odt': OdtReader
        }

        chosen = registry.get(suffix)
        return chosen(file_path) if chosen else None
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import Tuple, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DocumentReader(ABC):
    """Abstract base class for document readers.

    Keeps the document path and its containing directory, and provides the
    helpers concrete readers use to create an image directory and to save
    and describe a single extracted image.
    """

    def __init__(self, file_path: str):
        """Store the document path and derive its parent directory."""
        self.file_path = file_path
        self.base_dir = os.path.dirname(file_path)

    @abstractmethod
    def read(self, extract_images: bool = False) -> str:
        """Read document and optionally extract images"""
        pass

    def create_image_data_dir(self, extension_suffix: str) -> str:
        """
        Create data directory for images with file extension suffix to avoid conflicts.

        The resulting path is
        ``<base_dir>/<stem>_<extension_suffix>.data/images``, with ``<stem>``
        being the document file name without extension.

        Returns:
            str: Path to images directory
        """
        stem = os.path.splitext(os.path.basename(self.file_path))[0]
        images_dir = os.path.join(
            self.base_dir, f"{stem}_{extension_suffix}.data", "images")
        # A single call with exist_ok=True is used instead of checking for the
        # directory first, so a concurrently created directory is not an error.
        os.makedirs(images_dir, exist_ok=True)
        return images_dir

    def save_and_describe_image(self, image_data: bytes, image_format: str,
                                save_dir: str, image_counter: int) -> Tuple[str, str]:
        """
        Save image data and get its description from LLM.

        The file name is ``<image_counter>.<image_format>`` inside ``save_dir``.

        Returns:
            tuple: (relative_image_path, description); the path is relative
            to ``self.base_dir``
        """
        # Imported at call time rather than at module import.
        from ara_cli.prompt_handler import describe_image

        # Save image
        image_path = os.path.join(save_dir, f"{image_counter}.{image_format}")
        with open(image_path, "wb") as image_file:
            image_file.write(image_data)

        # Get image description from LLM
        description = describe_image(image_path)

        # Get relative path
        relative_image_path = os.path.relpath(image_path, self.base_dir)

        return relative_image_path, description
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class DocxReader(DocumentReader):
    """Reader for DOCX files"""

    def read(self, extract_images: bool = False) -> str:
        """Return the document's paragraph text, joined by newlines.

        When ``extract_images`` is true, relationships of the main document
        part whose type contains "image" are saved, described, and appended
        to the text under an "Extracted Images" heading.
        """
        import docx

        document = docx.Document(self.file_path)
        body_text = '\n'.join([paragraph.text for paragraph in document.paragraphs])

        if not extract_images:
            return body_text

        from PIL import Image
        import io

        target_dir = self.create_image_data_dir("docx")

        # Generator keeps the per-relationship access order of a plain loop.
        image_parts = (
            relation.target_part
            for relation in document.part.rels.values()
            if "image" in relation.reltype
        )

        sections = []
        for number, part in enumerate(image_parts, start=1):
            blob = part.blob

            # The detected format is used as the saved file's extension.
            extension = Image.open(io.BytesIO(blob)).format.lower()

            rel_path, caption = self.save_and_describe_image(
                blob, extension, target_dir, number
            )
            sections.append(f"\nImage: {rel_path}\n[{caption}]\n")

        if not sections:
            return body_text

        return body_text + "\n\n### Extracted Images\n" + "\n".join(sections)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class PdfReader(DocumentReader):
    """Reader for PDF files"""

    def read(self, extract_images: bool = False) -> str:
        """Convert the PDF to markdown text.

        When ``extract_images`` is true, each image listed on each page is
        extracted, saved, described, and appended to the text under an
        "Extracted Images" heading.

        Returns:
            str: The markdown text, optionally followed by image descriptions
        """
        import pymupdf4llm

        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import fitz  # PyMuPDF

        # Create images directory
        images_dir = self.create_image_data_dir("pdf")

        # Extract text without images first
        text_content = pymupdf4llm.to_markdown(self.file_path, write_images=False)

        # Extract and process images
        image_descriptions = []
        image_counter = 1

        doc = fitz.open(self.file_path)
        try:
            for page in doc:
                for img in page.get_images():
                    # First entry of each image record is its xref
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Save and describe image
                    relative_path, description = self.save_and_describe_image(
                        image_bytes, image_ext, images_dir, image_counter
                    )

                    # Add formatted description to list
                    image_descriptions.append(
                        f"\nImage: {relative_path}\n[{description}]\n")

                    image_counter += 1
        finally:
            # Close the document on every exit path, including failures
            # while saving or describing an image.
            doc.close()

        # Combine text content with image descriptions
        if image_descriptions:
            text_content += "\n\n### Extracted Images\n" + "\n".join(image_descriptions)

        return text_content
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class OdtReader(DocumentReader):
    """Reader for ODT files"""

    def read(self, extract_images: bool = False) -> str:
        """Convert the ODT file to markdown text.

        When ``extract_images`` is true, the file is also read as a zip
        archive; each member under ``Pictures/`` is saved, described and
        appended under an "Extracted Images" heading. An error during image
        extraction prints a warning and ends extraction; descriptions
        collected before the error are kept.
        """
        import pymupdf4llm

        # NOTE(review): confirm that pymupdf4llm accepts ODT input.
        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import zipfile
        from PIL import Image
        import io

        target_dir = self.create_image_data_dir("odt")
        markdown = pymupdf4llm.to_markdown(self.file_path, write_images=False)

        sections = []
        try:
            with zipfile.ZipFile(self.file_path, 'r') as archive:
                members = [
                    name for name in archive.namelist()
                    if name.startswith('Pictures/')
                ]

                # Any failure exits the loop, so the numbering has no gaps.
                for number, member in enumerate(members, start=1):
                    payload = archive.read(member)

                    # The detected format is used as the file extension.
                    extension = Image.open(io.BytesIO(payload)).format.lower()

                    rel_path, caption = self.save_and_describe_image(
                        payload, extension, target_dir, number
                    )
                    sections.append(f"\nImage: {rel_path}\n[{caption}]\n")
        except Exception as e:
            print(f"Warning: Could not extract images from ODT: {e}")

        if not sections:
            return markdown

        return markdown + "\n\n### Extracted Images\n" + "\n".join(sections)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class DocumentReaderFactory:
    """Factory for creating appropriate document readers"""

    @staticmethod
    def create_reader(file_path: str) -> Optional[DocumentReader]:
        """Map the lower-cased file extension to a reader instance.

        Returns:
            A reader for ``.docx``, ``.pdf`` or ``.odt`` paths; ``None`` for
            any other extension.
        """
        suffix = os.path.splitext(file_path)[1].lower()

        registry = {
            '.docx': DocxReader,
            '.pdf': PdfReader,
            '.odt': OdtReader
        }

        chosen = registry.get(suffix)
        return chosen(file_path) if chosen else None
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FileLoader(ABC):
    """Base class for file loaders; each loader is bound to one chat instance."""

    def __init__(self, chat_instance):
        # The chat object is kept so subclasses can reach it via ``self.chat``.
        self.chat = chat_instance

    @abstractmethod
    def load(self, file_path: str, **kwargs) -> bool:
        """Load ``file_path``; concrete loaders supply the implementation."""

    def add_prompt_tag_if_needed(self):
        """Delegate to the chat's own ``add_prompt_tag_if_needed`` with its ``chat_name``."""
        self.chat.add_prompt_tag_if_needed(self.chat.chat_name)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FileLoaderFactory:
    """Factory for creating appropriate file loaders"""

    # File extensions treated as binary images, with their MIME types.
    BINARY_TYPE_MAPPING = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
    }

    # File extensions routed to the document loader.
    DOCUMENT_TYPE_EXTENSIONS = [".docx", ".doc", ".odt", ".pdf"]

    @staticmethod
    def create_loader(file_name: str, chat_instance) -> Optional[FileLoader]:
        """Choose a loader from the (case-insensitive) file name ending.

        Binary image extensions are checked first, then document extensions;
        every other name gets the text loader.
        """
        from ara_cli.file_loaders.binary_file_loader import BinaryFileLoader
        from ara_cli.file_loaders.text_file_loader import TextFileLoader
        from ara_cli.file_loaders.document_file_loader import DocumentFileLoader

        lowered = file_name.lower()

        # str.endswith accepts a tuple, so each group needs only one call.
        if lowered.endswith(tuple(FileLoaderFactory.BINARY_TYPE_MAPPING)):
            return BinaryFileLoader(chat_instance)

        if lowered.endswith(tuple(FileLoaderFactory.DOCUMENT_TYPE_EXTENSIONS)):
            return DocumentFileLoader(chat_instance)

        return TextFileLoader(chat_instance)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import base64
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from ara_cli.file_loaders.markdown_reader import MarkdownReader
|
|
6
|
+
from ara_cli.file_loaders.document_readers import DocumentReaderFactory
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FileLoader(ABC):
    """Base class for file loaders; each loader is bound to one chat instance."""

    def __init__(self, chat_instance):
        # Subclasses read ``self.chat.chat_name`` to find the chat file.
        self.chat = chat_instance

    @abstractmethod
    def load(self, file_path: str, **kwargs) -> bool:
        """Load ``file_path``; concrete loaders supply the implementation."""

    def add_prompt_tag_if_needed(self):
        """Delegate to the chat's own ``add_prompt_tag_if_needed`` with its ``chat_name``."""
        self.chat.add_prompt_tag_if_needed(self.chat.chat_name)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TextFileLoader(FileLoader):
    """Loads text files"""

    def load(self, file_path: str, prefix: str = "", suffix: str = "",
             block_delimiter: str = "", extract_images: bool = False) -> bool:
        """Append the file's text to the chat file.

        Markdown files (``.md``) are read through ``MarkdownReader`` when
        ``extract_images`` is set; everything else is read as UTF-8 with
        undecodable bytes replaced. A non-empty ``block_delimiter`` is put on
        its own line before and after the content, and ``prefix``/``suffix``
        wrap the result.

        Returns:
            bool: Always ``True`` once the content has been written
        """
        content = self._read_content(file_path, extract_images)

        if block_delimiter:
            content = f"{block_delimiter}\n{content}\n{block_delimiter}"

        with open(self.chat.chat_name, 'a', encoding='utf-8') as chat_file:
            chat_file.write(f"{prefix}{content}{suffix}\n")

        return True

    def _read_content(self, file_path: str, extract_images: bool) -> str:
        """Return the raw text for ``file_path`` using the matching reader."""
        if file_path.lower().endswith('.md') and extract_images:
            return MarkdownReader(file_path).read(extract_images=True)

        with open(file_path, 'r', encoding='utf-8', errors="replace") as source:
            return source.read()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class BinaryFileLoader(FileLoader):
    """Loads binary files (images)"""

    def load(self, file_path: str, mime_type: str, prefix: str = "", suffix: str = "") -> bool:
        """Load binary file as base64 and append it to the chat file.

        Previously the encoded payload and ``mime_type`` were never used, so
        only ``prefix`` and ``suffix`` were written and the image was lost.
        The payload is now embedded as a markdown image with a data URL.

        Args:
            file_path: Path of the binary file to read.
            mime_type: MIME type placed in the data URL (e.g. ``image/png``).
            prefix: Text written before the embedded image.
            suffix: Text written after the embedded image.

        Returns:
            bool: Always ``True`` once the content has been written
        """
        with open(file_path, 'rb') as file:
            file_content = file.read()

        base64_image = base64.b64encode(file_content).decode("utf-8")

        # NOTE(review): the markdown image syntax and the file-name alt text
        # are assumed; confirm against whatever parses images out of the chat.
        image_name = os.path.basename(file_path)
        write_content = (
            f"{prefix}![{image_name}](data:{mime_type};base64,{base64_image}){suffix}\n"
        )

        with open(self.chat.chat_name, 'a', encoding='utf-8') as chat_file:
            chat_file.write(write_content)

        return True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class DocumentFileLoader(FileLoader):
    """Loads document files (PDF, DOCX, ODT)"""

    def load(self, file_path: str, prefix: str = "", suffix: str = "",
             block_delimiter: str = "```", extract_images: bool = False) -> bool:
        """Append the document's extracted text to the chat file.

        The reader comes from ``DocumentReaderFactory``. A non-empty
        ``block_delimiter`` is put on its own line before and after the text,
        and ``prefix``/``suffix`` wrap the result.

        Returns:
            bool: ``False`` (after printing a message) when no reader exists
            for the file, otherwise ``True``
        """
        document_reader = DocumentReaderFactory.create_reader(file_path)

        # Guard clause: nothing is written for unsupported files.
        if not document_reader:
            print("Unsupported document type.")
            return False

        extracted = document_reader.read(extract_images=extract_images)

        if block_delimiter:
            extracted = f"{block_delimiter}\n{extracted}\n{block_delimiter}"

        with open(self.chat.chat_name, 'a', encoding='utf-8') as chat_file:
            chat_file.write(f"{prefix}{extracted}{suffix}\n")

        return True
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class FileLoaderFactory:
    """Factory for creating appropriate file loaders"""

    # File extensions treated as binary images, with their MIME types.
    BINARY_TYPE_MAPPING = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
    }

    # File extensions routed to the document loader.
    DOCUMENT_TYPE_EXTENSIONS = [".docx", ".doc", ".odt", ".pdf"]

    @staticmethod
    def create_loader(file_name: str, chat_instance) -> Optional[FileLoader]:
        """Choose a loader from the (case-insensitive) file name ending.

        Binary image extensions are checked first, then document extensions;
        every other name gets the text loader.
        """
        lowered = file_name.lower()

        # str.endswith accepts a tuple, so each group needs only one call.
        if lowered.endswith(tuple(FileLoaderFactory.BINARY_TYPE_MAPPING)):
            return BinaryFileLoader(chat_instance)

        if lowered.endswith(tuple(FileLoaderFactory.DOCUMENT_TYPE_EXTENSIONS)):
            return DocumentFileLoader(chat_instance)

        return TextFileLoader(chat_instance)
|