ara-cli 0.1.13.3__py3-none-any.whl → 0.1.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ara_cli/__init__.py +1 -1
- ara_cli/ara_command_action.py +162 -112
- ara_cli/ara_config.py +1 -1
- ara_cli/ara_subcommands/convert.py +66 -2
- ara_cli/ara_subcommands/prompt.py +266 -106
- ara_cli/artefact_autofix.py +2 -2
- ara_cli/artefact_converter.py +152 -53
- ara_cli/artefact_creator.py +41 -17
- ara_cli/artefact_lister.py +3 -3
- ara_cli/artefact_models/artefact_model.py +1 -1
- ara_cli/artefact_models/artefact_templates.py +0 -9
- ara_cli/artefact_models/feature_artefact_model.py +8 -8
- ara_cli/artefact_reader.py +62 -43
- ara_cli/artefact_scan.py +39 -17
- ara_cli/chat.py +23 -15
- ara_cli/children_contribution_updater.py +737 -0
- ara_cli/classifier.py +34 -0
- ara_cli/commands/load_command.py +4 -3
- ara_cli/commands/load_image_command.py +1 -1
- ara_cli/commands/read_command.py +23 -27
- ara_cli/completers.py +24 -0
- ara_cli/error_handler.py +26 -11
- ara_cli/file_loaders/document_reader.py +0 -178
- ara_cli/file_loaders/factories/__init__.py +0 -0
- ara_cli/file_loaders/factories/document_reader_factory.py +32 -0
- ara_cli/file_loaders/factories/file_loader_factory.py +27 -0
- ara_cli/file_loaders/file_loader.py +1 -30
- ara_cli/file_loaders/loaders/__init__.py +0 -0
- ara_cli/file_loaders/{document_file_loader.py → loaders/document_file_loader.py} +1 -1
- ara_cli/file_loaders/loaders/text_file_loader.py +47 -0
- ara_cli/file_loaders/readers/__init__.py +0 -0
- ara_cli/file_loaders/readers/docx_reader.py +49 -0
- ara_cli/file_loaders/readers/excel_reader.py +27 -0
- ara_cli/file_loaders/{markdown_reader.py → readers/markdown_reader.py} +1 -1
- ara_cli/file_loaders/readers/odt_reader.py +59 -0
- ara_cli/file_loaders/readers/pdf_reader.py +54 -0
- ara_cli/file_loaders/readers/pptx_reader.py +104 -0
- ara_cli/file_loaders/tools/__init__.py +0 -0
- ara_cli/output_suppressor.py +53 -0
- ara_cli/prompt_handler.py +123 -17
- ara_cli/tag_extractor.py +8 -7
- ara_cli/version.py +1 -1
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/METADATA +18 -12
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/RECORD +58 -45
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/WHEEL +1 -1
- tests/test_artefact_converter.py +1 -46
- tests/test_artefact_lister.py +11 -8
- tests/test_chat.py +4 -4
- tests/test_chat_givens_images.py +1 -1
- tests/test_children_contribution_updater.py +98 -0
- tests/test_document_loader_office.py +267 -0
- tests/test_prompt_handler.py +416 -214
- tests/test_setup_default_chat_prompt_mode.py +198 -0
- tests/test_tag_extractor.py +95 -49
- ara_cli/file_loaders/document_readers.py +0 -233
- ara_cli/file_loaders/file_loaders.py +0 -123
- ara_cli/file_loaders/text_file_loader.py +0 -187
- /ara_cli/file_loaders/{binary_file_loader.py → loaders/binary_file_loader.py} +0 -0
- /ara_cli/file_loaders/{image_processor.py → tools/image_processor.py} +0 -0
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/entry_points.txt +0 -0
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/top_level.txt +0 -0
ara_cli/classifier.py
CHANGED
|
@@ -72,3 +72,37 @@ class Classifier:
|
|
|
72
72
|
@lru_cache(maxsize=None)
|
|
73
73
|
def artefact_titles():
|
|
74
74
|
return Classifier.artefact_title.values()
|
|
75
|
+
|
|
76
|
+
# Leaf-node classifiers that cannot have children
|
|
77
|
+
_leaf_classifiers = {"task", "issue"}
|
|
78
|
+
|
|
79
|
+
# Contribution hierarchy: child -> valid parent classifiers
|
|
80
|
+
contribution_hierarchy = {
|
|
81
|
+
"businessgoal": ["vision"],
|
|
82
|
+
"capability": ["vision", "businessgoal"],
|
|
83
|
+
"keyfeature": ["vision", "businessgoal", "capability"],
|
|
84
|
+
"epic": ["vision", "businessgoal", "capability", "keyfeature"],
|
|
85
|
+
"userstory": ["vision", "businessgoal", "capability", "keyfeature", "epic"],
|
|
86
|
+
"example": ["vision", "businessgoal", "capability", "keyfeature", "epic", "userstory"],
|
|
87
|
+
"feature": ["vision", "businessgoal", "capability", "keyfeature", "epic", "userstory"],
|
|
88
|
+
"task": ["vision", "businessgoal", "capability", "keyfeature", "epic", "userstory", "feature"],
|
|
89
|
+
"issue": ["vision", "businessgoal", "capability", "keyfeature", "epic", "userstory", "feature"],
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
@staticmethod
|
|
93
|
+
@lru_cache(maxsize=None)
|
|
94
|
+
def can_have_children(classifier: str) -> bool:
|
|
95
|
+
"""
|
|
96
|
+
Check if a classifier can have children artefacts.
|
|
97
|
+
Task and Issue are leaf-node classifiers and cannot have children.
|
|
98
|
+
"""
|
|
99
|
+
return classifier not in Classifier._leaf_classifiers
|
|
100
|
+
|
|
101
|
+
@staticmethod
|
|
102
|
+
@lru_cache(maxsize=None)
|
|
103
|
+
def get_valid_parent_classifiers(child_classifier: str) -> list:
|
|
104
|
+
"""
|
|
105
|
+
Get list of valid parent classifiers for a given child classifier.
|
|
106
|
+
Returns empty list if classifier has no valid parents (e.g., vision).
|
|
107
|
+
"""
|
|
108
|
+
return Classifier.contribution_hierarchy.get(child_classifier, [])
|
ara_cli/commands/load_command.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from ara_cli.commands.command import Command
|
|
2
|
-
from ara_cli.file_loaders.
|
|
3
|
-
from ara_cli.file_loaders.binary_file_loader import BinaryFileLoader
|
|
2
|
+
from ara_cli.file_loaders.factories.file_loader_factory import FileLoaderFactory
|
|
3
|
+
from ara_cli.file_loaders.loaders.binary_file_loader import BinaryFileLoader
|
|
4
|
+
from ara_cli import BINARY_TYPE_MAPPING
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class LoadCommand(Command):
|
|
@@ -29,7 +30,7 @@ class LoadCommand(Command):
|
|
|
29
30
|
# Determine mime type for binary files
|
|
30
31
|
file_name_lower = self.file_path.lower()
|
|
31
32
|
mime_type = None
|
|
32
|
-
for extension, mt in
|
|
33
|
+
for extension, mt in BINARY_TYPE_MAPPING.items():
|
|
33
34
|
if file_name_lower.endswith(extension):
|
|
34
35
|
mime_type = mt
|
|
35
36
|
break
|
ara_cli/commands/read_command.py
CHANGED
|
@@ -3,9 +3,9 @@ from ara_cli.artefact_reader import ArtefactReader
|
|
|
3
3
|
from ara_cli.file_classifier import FileClassifier
|
|
4
4
|
from ara_cli.list_filter import ListFilter, filter_list
|
|
5
5
|
from ara_cli.artefact_models.artefact_data_retrieval import (
|
|
6
|
-
artefact_content_retrieval,
|
|
7
|
-
artefact_path_retrieval,
|
|
8
|
-
artefact_tags_retrieval
|
|
6
|
+
artefact_content_retrieval,
|
|
7
|
+
artefact_path_retrieval,
|
|
8
|
+
artefact_tags_retrieval,
|
|
9
9
|
)
|
|
10
10
|
from ara_cli.artefact_fuzzy_search import suggest_close_name_matches
|
|
11
11
|
import os
|
|
@@ -18,7 +18,7 @@ class ReadCommand(Command):
|
|
|
18
18
|
artefact_name: str,
|
|
19
19
|
read_mode: str = "default",
|
|
20
20
|
list_filter: ListFilter = None,
|
|
21
|
-
output=None
|
|
21
|
+
output=None,
|
|
22
22
|
):
|
|
23
23
|
self.classifier = classifier
|
|
24
24
|
self.artefact_name = artefact_name
|
|
@@ -29,7 +29,8 @@ class ReadCommand(Command):
|
|
|
29
29
|
def execute(self) -> bool:
|
|
30
30
|
"""Execute the read command and return success status."""
|
|
31
31
|
file_classifier = FileClassifier(os)
|
|
32
|
-
|
|
32
|
+
reader = ArtefactReader()
|
|
33
|
+
classified_artefacts = reader.read_artefacts()
|
|
33
34
|
|
|
34
35
|
if not self.classifier or not self.artefact_name:
|
|
35
36
|
self._filter_and_print(classified_artefacts, file_classifier)
|
|
@@ -39,15 +40,12 @@ class ReadCommand(Command):
|
|
|
39
40
|
all_artefact_names = [a.title for a in artefacts]
|
|
40
41
|
|
|
41
42
|
if self.artefact_name not in all_artefact_names:
|
|
42
|
-
suggest_close_name_matches(
|
|
43
|
-
self.artefact_name,
|
|
44
|
-
all_artefact_names
|
|
45
|
-
)
|
|
43
|
+
suggest_close_name_matches(self.artefact_name, all_artefact_names)
|
|
46
44
|
return False
|
|
47
45
|
|
|
48
|
-
target_artefact = next(
|
|
49
|
-
lambda x: x.title == self.artefact_name, artefacts
|
|
50
|
-
)
|
|
46
|
+
target_artefact = next(
|
|
47
|
+
filter(lambda x: x.title == self.artefact_name, artefacts)
|
|
48
|
+
)
|
|
51
49
|
|
|
52
50
|
artefacts_by_classifier = {self.classifier: []}
|
|
53
51
|
|
|
@@ -55,16 +53,14 @@ class ReadCommand(Command):
|
|
|
55
53
|
match self.read_mode:
|
|
56
54
|
case "branch":
|
|
57
55
|
self._handle_branch_mode(
|
|
58
|
-
classified_artefacts, artefacts_by_classifier
|
|
56
|
+
classified_artefacts, artefacts_by_classifier, reader
|
|
59
57
|
)
|
|
60
58
|
case "children":
|
|
61
59
|
artefacts_by_classifier = self._handle_children_mode(
|
|
62
|
-
classified_artefacts
|
|
60
|
+
classified_artefacts, reader
|
|
63
61
|
)
|
|
64
62
|
case _:
|
|
65
|
-
self._handle_default_mode(
|
|
66
|
-
target_artefact, artefacts_by_classifier
|
|
67
|
-
)
|
|
63
|
+
self._handle_default_mode(target_artefact, artefacts_by_classifier)
|
|
68
64
|
|
|
69
65
|
# Apply filtering and print results
|
|
70
66
|
self._filter_and_print(artefacts_by_classifier, file_classifier)
|
|
@@ -78,21 +74,23 @@ class ReadCommand(Command):
|
|
|
78
74
|
self.output(f"Error reading artefact: {e}")
|
|
79
75
|
return False
|
|
80
76
|
|
|
81
|
-
def _handle_branch_mode(
|
|
77
|
+
def _handle_branch_mode(
|
|
78
|
+
self, classified_artefacts, artefacts_by_classifier, reader
|
|
79
|
+
):
|
|
82
80
|
"""Handle branch read mode."""
|
|
83
|
-
|
|
81
|
+
reader.step_through_value_chain(
|
|
84
82
|
artefact_name=self.artefact_name,
|
|
85
83
|
classifier=self.classifier,
|
|
86
84
|
artefacts_by_classifier=artefacts_by_classifier,
|
|
87
|
-
classified_artefacts=classified_artefacts
|
|
85
|
+
classified_artefacts=classified_artefacts,
|
|
88
86
|
)
|
|
89
87
|
|
|
90
|
-
def _handle_children_mode(self, classified_artefacts):
|
|
88
|
+
def _handle_children_mode(self, classified_artefacts, reader):
|
|
91
89
|
"""Handle children read mode."""
|
|
92
|
-
return
|
|
90
|
+
return reader.find_children(
|
|
93
91
|
artefact_name=self.artefact_name,
|
|
94
92
|
classifier=self.classifier,
|
|
95
|
-
classified_artefacts=classified_artefacts
|
|
93
|
+
classified_artefacts=classified_artefacts,
|
|
96
94
|
)
|
|
97
95
|
|
|
98
96
|
def _handle_default_mode(self, target_artefact, artefacts_by_classifier):
|
|
@@ -106,12 +104,10 @@ class ReadCommand(Command):
|
|
|
106
104
|
list_filter=self.list_filter,
|
|
107
105
|
content_retrieval_strategy=artefact_content_retrieval,
|
|
108
106
|
file_path_retrieval=artefact_path_retrieval,
|
|
109
|
-
tag_retrieval=artefact_tags_retrieval
|
|
107
|
+
tag_retrieval=artefact_tags_retrieval,
|
|
110
108
|
)
|
|
111
109
|
|
|
112
110
|
def _filter_and_print(self, artefacts_by_classifier, file_classifier):
|
|
113
111
|
"""Apply list filtering and print results"""
|
|
114
112
|
filtered_artefacts = self._apply_filtering(artefacts_by_classifier)
|
|
115
|
-
file_classifier.print_classified_files(
|
|
116
|
-
filtered_artefacts, print_content=True
|
|
117
|
-
)
|
|
113
|
+
file_classifier.print_classified_files(filtered_artefacts, print_content=True)
|
ara_cli/completers.py
CHANGED
|
@@ -87,6 +87,21 @@ def complete_chat_files(incomplete: str) -> List[str]:
|
|
|
87
87
|
return []
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def complete_prompt_step(incomplete: str) -> List[str]:
|
|
91
|
+
"""Complete prompt step/subcommand names."""
|
|
92
|
+
steps = [
|
|
93
|
+
"init",
|
|
94
|
+
"load",
|
|
95
|
+
"send",
|
|
96
|
+
"load-and-send",
|
|
97
|
+
"extract",
|
|
98
|
+
"update",
|
|
99
|
+
"chat",
|
|
100
|
+
"init-rag",
|
|
101
|
+
]
|
|
102
|
+
return [s for s in steps if s.startswith(incomplete)]
|
|
103
|
+
|
|
104
|
+
|
|
90
105
|
# Dynamic completers that need context
|
|
91
106
|
class DynamicCompleters:
|
|
92
107
|
@staticmethod
|
|
@@ -164,6 +179,15 @@ class DynamicCompleters:
|
|
|
164
179
|
|
|
165
180
|
return completer
|
|
166
181
|
|
|
182
|
+
@staticmethod
|
|
183
|
+
def create_prompt_step_completer():
|
|
184
|
+
"""Create a completer for prompt step/subcommand names."""
|
|
185
|
+
|
|
186
|
+
def completer(ctx: typer.Context, incomplete: str) -> List[str]:
|
|
187
|
+
return complete_prompt_step(incomplete)
|
|
188
|
+
|
|
189
|
+
return completer
|
|
190
|
+
|
|
167
191
|
@staticmethod
|
|
168
192
|
def create_convert_source_artefact_name_completer():
|
|
169
193
|
"""Create a completer for convert command source artefact names based on old_classifier context."""
|
ara_cli/error_handler.py
CHANGED
|
@@ -5,8 +5,9 @@ from enum import Enum
|
|
|
5
5
|
from functools import wraps
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
RED =
|
|
9
|
-
|
|
8
|
+
RED = "\033[91m"
|
|
9
|
+
YELLOW = "\033[93m"
|
|
10
|
+
RESET = "\033[0m"
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class ErrorLevel(Enum):
|
|
@@ -69,7 +70,6 @@ class ErrorHandler:
|
|
|
69
70
|
|
|
70
71
|
sys.exit(1)
|
|
71
72
|
|
|
72
|
-
|
|
73
73
|
def report_error(self, error: Exception, context: Optional[str] = None) -> None:
|
|
74
74
|
"""Report error with standardized formatting but don't exit"""
|
|
75
75
|
if isinstance(error, AraError):
|
|
@@ -77,33 +77,44 @@ class ErrorHandler:
|
|
|
77
77
|
else:
|
|
78
78
|
self._report_generic_error(error, context)
|
|
79
79
|
|
|
80
|
-
|
|
81
80
|
def _report_ara_error(self, error: AraError, context: Optional[str] = None) -> None:
|
|
82
81
|
"""Report ARA-specific errors without exiting"""
|
|
83
82
|
error_prefix = f"[{error.level.value}]"
|
|
84
83
|
|
|
84
|
+
# Choose color based on error level
|
|
85
|
+
if error.level in (ErrorLevel.INFO, ErrorLevel.WARNING):
|
|
86
|
+
color = YELLOW
|
|
87
|
+
else:
|
|
88
|
+
color = RED
|
|
89
|
+
|
|
85
90
|
if context:
|
|
86
|
-
print(
|
|
91
|
+
print(
|
|
92
|
+
f"{color}{error_prefix} {context}: {error.message}{RESET}",
|
|
93
|
+
file=sys.stderr,
|
|
94
|
+
)
|
|
87
95
|
else:
|
|
88
|
-
print(f"{
|
|
96
|
+
print(f"{color}{error_prefix} {error.message}{RESET}", file=sys.stderr)
|
|
89
97
|
|
|
90
98
|
if self.debug_mode:
|
|
91
99
|
traceback.print_exc()
|
|
92
100
|
|
|
93
|
-
|
|
94
|
-
|
|
101
|
+
def _report_generic_error(
|
|
102
|
+
self, error: Exception, context: Optional[str] = None
|
|
103
|
+
) -> None:
|
|
95
104
|
"""Report generic Python errors without exiting"""
|
|
96
105
|
error_type = type(error).__name__
|
|
97
106
|
|
|
98
107
|
if context:
|
|
99
|
-
print(
|
|
108
|
+
print(
|
|
109
|
+
f"{RED}[ERROR] {context}: {error_type}: {str(error)}{RESET}",
|
|
110
|
+
file=sys.stderr,
|
|
111
|
+
)
|
|
100
112
|
else:
|
|
101
113
|
print(f"{RED}[ERROR] {error_type}: {str(error)}{RESET}", file=sys.stderr)
|
|
102
114
|
|
|
103
115
|
if self.debug_mode:
|
|
104
116
|
traceback.print_exc()
|
|
105
117
|
|
|
106
|
-
|
|
107
118
|
def validate_and_exit(
|
|
108
119
|
self, condition: bool, message: str, error_code: int = 1
|
|
109
120
|
) -> None:
|
|
@@ -112,7 +123,11 @@ class ErrorHandler:
|
|
|
112
123
|
raise AraValidationError(message)
|
|
113
124
|
|
|
114
125
|
|
|
115
|
-
def handle_errors(
|
|
126
|
+
def handle_errors(
|
|
127
|
+
_func=None,
|
|
128
|
+
context: Optional[str] = None,
|
|
129
|
+
error_handler: Optional[ErrorHandler] = None,
|
|
130
|
+
):
|
|
116
131
|
"""Decorator to handle errors in action functions"""
|
|
117
132
|
|
|
118
133
|
def decorator(func):
|
|
@@ -61,185 +61,7 @@ class DocumentReader(ABC):
|
|
|
61
61
|
return relative_image_path, description
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
class DocxReader(DocumentReader):
|
|
65
|
-
"""Reader for DOCX files"""
|
|
66
64
|
|
|
67
|
-
def read(self, extract_images: bool = False) -> str:
|
|
68
|
-
import docx
|
|
69
|
-
|
|
70
|
-
doc = docx.Document(self.file_path)
|
|
71
|
-
text_content = '\n'.join(para.text for para in doc.paragraphs)
|
|
72
|
-
|
|
73
|
-
if not extract_images:
|
|
74
|
-
return text_content
|
|
75
|
-
|
|
76
|
-
from PIL import Image
|
|
77
|
-
import io
|
|
78
|
-
|
|
79
|
-
# Create data directory for images
|
|
80
|
-
images_dir = self.create_image_data_dir("docx")
|
|
81
|
-
|
|
82
|
-
# Extract and process images
|
|
83
|
-
image_descriptions = []
|
|
84
|
-
image_counter = 1
|
|
85
|
-
|
|
86
|
-
for rel in doc.part.rels.values():
|
|
87
|
-
if "image" in rel.reltype:
|
|
88
|
-
image_data = rel.target_part.blob
|
|
89
|
-
|
|
90
|
-
# Determine image format
|
|
91
|
-
image = Image.open(io.BytesIO(image_data))
|
|
92
|
-
image_format = image.format.lower()
|
|
93
|
-
|
|
94
|
-
# Save and describe image
|
|
95
|
-
relative_path, description = self.save_and_describe_image(
|
|
96
|
-
image_data, image_format, images_dir, image_counter
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
# Add formatted description to list
|
|
100
|
-
image_description = f"\nImage: {relative_path}\n[{description}]\n"
|
|
101
|
-
image_descriptions.append(image_description)
|
|
102
|
-
|
|
103
|
-
image_counter += 1
|
|
104
|
-
|
|
105
|
-
# Combine text content with image descriptions
|
|
106
|
-
if image_descriptions:
|
|
107
|
-
text_content += "\n\n### Extracted Images\n" + \
|
|
108
|
-
"\n".join(image_descriptions)
|
|
109
|
-
|
|
110
|
-
return text_content
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
class PdfReader(DocumentReader):
|
|
114
|
-
"""Reader for PDF files"""
|
|
115
|
-
|
|
116
|
-
def read(self, extract_images: bool = False) -> str:
|
|
117
|
-
import pymupdf4llm
|
|
118
|
-
|
|
119
|
-
if not extract_images:
|
|
120
|
-
return pymupdf4llm.to_markdown(self.file_path, write_images=False)
|
|
121
|
-
|
|
122
|
-
import fitz # PyMuPDF
|
|
123
|
-
|
|
124
|
-
# Create images directory
|
|
125
|
-
images_dir = self.create_image_data_dir("pdf")
|
|
126
|
-
|
|
127
|
-
# Extract text without images first
|
|
128
|
-
text_content = pymupdf4llm.to_markdown(
|
|
129
|
-
self.file_path, write_images=False)
|
|
130
|
-
|
|
131
|
-
# Extract and process images
|
|
132
|
-
doc = fitz.open(self.file_path)
|
|
133
|
-
image_descriptions = []
|
|
134
|
-
image_counter = 1
|
|
135
|
-
|
|
136
|
-
for page_num, page in enumerate(doc):
|
|
137
|
-
image_list = page.get_images()
|
|
138
|
-
|
|
139
|
-
for img_index, img in enumerate(image_list):
|
|
140
|
-
# Extract image
|
|
141
|
-
xref = img[0]
|
|
142
|
-
base_image = doc.extract_image(xref)
|
|
143
|
-
image_bytes = base_image["image"]
|
|
144
|
-
image_ext = base_image["ext"]
|
|
145
|
-
|
|
146
|
-
# Save and describe image
|
|
147
|
-
relative_path, description = self.save_and_describe_image(
|
|
148
|
-
image_bytes, image_ext, images_dir, image_counter
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
# Add formatted description to list
|
|
152
|
-
image_description = f"\nImage: {relative_path}\n[{description}]\n"
|
|
153
|
-
image_descriptions.append(image_description)
|
|
154
|
-
|
|
155
|
-
image_counter += 1
|
|
156
|
-
|
|
157
|
-
doc.close()
|
|
158
|
-
|
|
159
|
-
# Combine text content with image descriptions
|
|
160
|
-
if image_descriptions:
|
|
161
|
-
text_content += "\n\n### Extracted Images\n" + \
|
|
162
|
-
"\n".join(image_descriptions)
|
|
163
|
-
|
|
164
|
-
return text_content
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
class OdtReader(DocumentReader):
|
|
168
|
-
"""Reader for ODT files"""
|
|
169
|
-
|
|
170
|
-
def read(self, extract_images: bool = False) -> str:
|
|
171
|
-
import pymupdf4llm
|
|
172
|
-
|
|
173
|
-
if not extract_images:
|
|
174
|
-
return pymupdf4llm.to_markdown(self.file_path, write_images=False)
|
|
175
|
-
|
|
176
|
-
import zipfile
|
|
177
|
-
from PIL import Image
|
|
178
|
-
import io
|
|
179
|
-
|
|
180
|
-
# Create data directory for images
|
|
181
|
-
images_dir = self.create_image_data_dir("odt")
|
|
182
|
-
|
|
183
|
-
# Get text content
|
|
184
|
-
text_content = pymupdf4llm.to_markdown(
|
|
185
|
-
self.file_path, write_images=False)
|
|
186
|
-
|
|
187
|
-
# Extract and process images from ODT
|
|
188
|
-
image_descriptions = []
|
|
189
|
-
image_counter = 1
|
|
190
|
-
|
|
191
|
-
try:
|
|
192
|
-
with zipfile.ZipFile(self.file_path, 'r') as odt_zip:
|
|
193
|
-
# List all files in the Pictures directory
|
|
194
|
-
picture_files = [
|
|
195
|
-
f for f in odt_zip.namelist() if f.startswith('Pictures/')]
|
|
196
|
-
|
|
197
|
-
for picture_file in picture_files:
|
|
198
|
-
# Extract image data
|
|
199
|
-
image_data = odt_zip.read(picture_file)
|
|
200
|
-
|
|
201
|
-
# Determine image format
|
|
202
|
-
image = Image.open(io.BytesIO(image_data))
|
|
203
|
-
image_format = image.format.lower()
|
|
204
|
-
|
|
205
|
-
# Save and describe image
|
|
206
|
-
relative_path, description = self.save_and_describe_image(
|
|
207
|
-
image_data, image_format, images_dir, image_counter
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
# Add formatted description to list
|
|
211
|
-
image_description = f"\nImage: {relative_path}\n[{description}]\n"
|
|
212
|
-
image_descriptions.append(image_description)
|
|
213
|
-
|
|
214
|
-
image_counter += 1
|
|
215
|
-
except Exception as e:
|
|
216
|
-
print(f"Warning: Could not extract images from ODT: {e}")
|
|
217
|
-
|
|
218
|
-
# Combine text content with image descriptions
|
|
219
|
-
if image_descriptions:
|
|
220
|
-
text_content += "\n\n### Extracted Images\n" + \
|
|
221
|
-
"\n".join(image_descriptions)
|
|
222
|
-
|
|
223
|
-
return text_content
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
class DocumentReaderFactory:
|
|
227
|
-
"""Factory for creating appropriate document readers"""
|
|
228
|
-
|
|
229
|
-
@staticmethod
|
|
230
|
-
def create_reader(file_path: str) -> Optional[DocumentReader]:
|
|
231
|
-
"""Create appropriate reader based on file extension"""
|
|
232
|
-
_, ext = os.path.splitext(file_path)
|
|
233
|
-
ext = ext.lower()
|
|
234
65
|
|
|
235
|
-
readers = {
|
|
236
|
-
'.docx': DocxReader,
|
|
237
|
-
'.pdf': PdfReader,
|
|
238
|
-
'.odt': OdtReader
|
|
239
|
-
}
|
|
240
66
|
|
|
241
|
-
reader_class = readers.get(ext)
|
|
242
|
-
if reader_class:
|
|
243
|
-
return reader_class(file_path)
|
|
244
67
|
|
|
245
|
-
return None
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from ara_cli.file_loaders.document_reader import DocumentReader
|
|
4
|
+
from ara_cli.file_loaders.readers.docx_reader import DocxReader
|
|
5
|
+
from ara_cli.file_loaders.readers.pdf_reader import PdfReader
|
|
6
|
+
from ara_cli.file_loaders.readers.odt_reader import OdtReader
|
|
7
|
+
from ara_cli.file_loaders.readers.excel_reader import ExcelReader
|
|
8
|
+
from ara_cli.file_loaders.readers.pptx_reader import PptxReader
|
|
9
|
+
|
|
10
|
+
class DocumentReaderFactory:
|
|
11
|
+
"""Factory for creating appropriate document readers"""
|
|
12
|
+
|
|
13
|
+
@staticmethod
|
|
14
|
+
def create_reader(file_path: str) -> Optional[DocumentReader]:
|
|
15
|
+
"""Create appropriate reader based on file extension"""
|
|
16
|
+
_, ext = os.path.splitext(file_path)
|
|
17
|
+
ext = ext.lower()
|
|
18
|
+
|
|
19
|
+
readers = {
|
|
20
|
+
'.docx': DocxReader,
|
|
21
|
+
'.pdf': PdfReader,
|
|
22
|
+
'.odt': OdtReader,
|
|
23
|
+
'.xlsx': ExcelReader,
|
|
24
|
+
'.xls': ExcelReader,
|
|
25
|
+
'.pptx': PptxReader
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
reader_class = readers.get(ext)
|
|
29
|
+
if reader_class:
|
|
30
|
+
return reader_class(file_path)
|
|
31
|
+
|
|
32
|
+
return None
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from ara_cli import BINARY_TYPE_MAPPING, DOCUMENT_TYPE_EXTENSIONS
|
|
3
|
+
from ara_cli.file_loaders.file_loader import FileLoader
|
|
4
|
+
|
|
5
|
+
class FileLoaderFactory:
|
|
6
|
+
"""Factory for creating appropriate file loaders"""
|
|
7
|
+
|
|
8
|
+
@staticmethod
|
|
9
|
+
def create_loader(file_name: str, chat_instance) -> Optional[FileLoader]:
|
|
10
|
+
"""Create appropriate loader based on file type"""
|
|
11
|
+
from ara_cli.file_loaders.loaders.binary_file_loader import BinaryFileLoader
|
|
12
|
+
from ara_cli.file_loaders.loaders.text_file_loader import TextFileLoader
|
|
13
|
+
from ara_cli.file_loaders.loaders.document_file_loader import DocumentFileLoader
|
|
14
|
+
|
|
15
|
+
file_name_lower = file_name.lower()
|
|
16
|
+
|
|
17
|
+
# Check if it's a binary file
|
|
18
|
+
for extension, mime_type in BINARY_TYPE_MAPPING.items():
|
|
19
|
+
if file_name_lower.endswith(extension):
|
|
20
|
+
return BinaryFileLoader(chat_instance)
|
|
21
|
+
|
|
22
|
+
# Check if it's a document
|
|
23
|
+
if any(file_name_lower.endswith(ext) for ext in DOCUMENT_TYPE_EXTENSIONS):
|
|
24
|
+
return DocumentFileLoader(chat_instance)
|
|
25
|
+
|
|
26
|
+
# Default to text file loader
|
|
27
|
+
return TextFileLoader(chat_instance)
|
|
@@ -18,33 +18,4 @@ class FileLoader(ABC):
|
|
|
18
18
|
self.chat.add_prompt_tag_if_needed(self.chat.chat_name)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
"""Factory for creating appropriate file loaders"""
|
|
23
|
-
BINARY_TYPE_MAPPING = {
|
|
24
|
-
".png": "image/png",
|
|
25
|
-
".jpg": "image/jpeg",
|
|
26
|
-
".jpeg": "image/jpeg",
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
DOCUMENT_TYPE_EXTENSIONS = [".docx", ".doc", ".odt", ".pdf"]
|
|
30
|
-
|
|
31
|
-
@staticmethod
|
|
32
|
-
def create_loader(file_name: str, chat_instance) -> Optional[FileLoader]:
|
|
33
|
-
"""Create appropriate loader based on file type"""
|
|
34
|
-
from ara_cli.file_loaders.binary_file_loader import BinaryFileLoader
|
|
35
|
-
from ara_cli.file_loaders.text_file_loader import TextFileLoader
|
|
36
|
-
from ara_cli.file_loaders.document_file_loader import DocumentFileLoader
|
|
37
|
-
|
|
38
|
-
file_name_lower = file_name.lower()
|
|
39
|
-
|
|
40
|
-
# Check if it's a binary file
|
|
41
|
-
for extension, mime_type in FileLoaderFactory.BINARY_TYPE_MAPPING.items():
|
|
42
|
-
if file_name_lower.endswith(extension):
|
|
43
|
-
return BinaryFileLoader(chat_instance)
|
|
44
|
-
|
|
45
|
-
# Check if it's a document
|
|
46
|
-
if any(file_name_lower.endswith(ext) for ext in FileLoaderFactory.DOCUMENT_TYPE_EXTENSIONS):
|
|
47
|
-
return DocumentFileLoader(chat_instance)
|
|
48
|
-
|
|
49
|
-
# Default to text file loader
|
|
50
|
-
return TextFileLoader(chat_instance)
|
|
21
|
+
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import base64
|
|
4
|
+
import tempfile
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
import requests
|
|
7
|
+
from charset_normalizer import from_path
|
|
8
|
+
from ara_cli.file_loaders.file_loader import FileLoader
|
|
9
|
+
from ara_cli.file_loaders.readers.markdown_reader import MarkdownReader
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TextFileLoader(FileLoader):
|
|
13
|
+
"""Loads text files"""
|
|
14
|
+
|
|
15
|
+
def load(
|
|
16
|
+
self,
|
|
17
|
+
file_path: str,
|
|
18
|
+
prefix: str = "",
|
|
19
|
+
suffix: str = "",
|
|
20
|
+
block_delimiter: str = "",
|
|
21
|
+
extract_images: bool = False,
|
|
22
|
+
**kwargs,
|
|
23
|
+
) -> bool:
|
|
24
|
+
"""Load text file with optional markdown image extraction"""
|
|
25
|
+
|
|
26
|
+
is_md_file = file_path.lower().endswith(".md")
|
|
27
|
+
|
|
28
|
+
if is_md_file and extract_images:
|
|
29
|
+
reader = MarkdownReader(file_path)
|
|
30
|
+
file_content = reader.read(extract_images=True).replace("\r\n", "\n")
|
|
31
|
+
else:
|
|
32
|
+
# Use charset-normalizer to detect encoding
|
|
33
|
+
encoded_content = from_path(file_path).best()
|
|
34
|
+
if not encoded_content:
|
|
35
|
+
print(f"Failed to detect encoding for {file_path}")
|
|
36
|
+
return False
|
|
37
|
+
file_content = str(encoded_content).replace("\r\n", "\n")
|
|
38
|
+
|
|
39
|
+
if block_delimiter:
|
|
40
|
+
file_content = f"{block_delimiter}\n{file_content}\n{block_delimiter}"
|
|
41
|
+
|
|
42
|
+
write_content = f"{prefix}{file_content}{suffix}\n"
|
|
43
|
+
|
|
44
|
+
with open(self.chat.chat_name, "a", encoding="utf-8") as chat_file:
|
|
45
|
+
chat_file.write(write_content)
|
|
46
|
+
|
|
47
|
+
return True
|
|
File without changes
|