ara-cli 0.1.13.3__py3-none-any.whl → 0.1.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ara_cli/__init__.py +1 -1
- ara_cli/ara_command_action.py +162 -112
- ara_cli/ara_config.py +1 -1
- ara_cli/ara_subcommands/convert.py +66 -2
- ara_cli/ara_subcommands/prompt.py +266 -106
- ara_cli/artefact_autofix.py +2 -2
- ara_cli/artefact_converter.py +152 -53
- ara_cli/artefact_creator.py +41 -17
- ara_cli/artefact_lister.py +3 -3
- ara_cli/artefact_models/artefact_model.py +1 -1
- ara_cli/artefact_models/artefact_templates.py +0 -9
- ara_cli/artefact_models/feature_artefact_model.py +8 -8
- ara_cli/artefact_reader.py +62 -43
- ara_cli/artefact_scan.py +39 -17
- ara_cli/chat.py +23 -15
- ara_cli/children_contribution_updater.py +737 -0
- ara_cli/classifier.py +34 -0
- ara_cli/commands/load_command.py +4 -3
- ara_cli/commands/load_image_command.py +1 -1
- ara_cli/commands/read_command.py +23 -27
- ara_cli/completers.py +24 -0
- ara_cli/error_handler.py +26 -11
- ara_cli/file_loaders/document_reader.py +0 -178
- ara_cli/file_loaders/factories/__init__.py +0 -0
- ara_cli/file_loaders/factories/document_reader_factory.py +32 -0
- ara_cli/file_loaders/factories/file_loader_factory.py +27 -0
- ara_cli/file_loaders/file_loader.py +1 -30
- ara_cli/file_loaders/loaders/__init__.py +0 -0
- ara_cli/file_loaders/{document_file_loader.py → loaders/document_file_loader.py} +1 -1
- ara_cli/file_loaders/loaders/text_file_loader.py +47 -0
- ara_cli/file_loaders/readers/__init__.py +0 -0
- ara_cli/file_loaders/readers/docx_reader.py +49 -0
- ara_cli/file_loaders/readers/excel_reader.py +27 -0
- ara_cli/file_loaders/{markdown_reader.py → readers/markdown_reader.py} +1 -1
- ara_cli/file_loaders/readers/odt_reader.py +59 -0
- ara_cli/file_loaders/readers/pdf_reader.py +54 -0
- ara_cli/file_loaders/readers/pptx_reader.py +104 -0
- ara_cli/file_loaders/tools/__init__.py +0 -0
- ara_cli/output_suppressor.py +53 -0
- ara_cli/prompt_handler.py +123 -17
- ara_cli/tag_extractor.py +8 -7
- ara_cli/version.py +1 -1
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/METADATA +18 -12
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/RECORD +58 -45
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/WHEEL +1 -1
- tests/test_artefact_converter.py +1 -46
- tests/test_artefact_lister.py +11 -8
- tests/test_chat.py +4 -4
- tests/test_chat_givens_images.py +1 -1
- tests/test_children_contribution_updater.py +98 -0
- tests/test_document_loader_office.py +267 -0
- tests/test_prompt_handler.py +416 -214
- tests/test_setup_default_chat_prompt_mode.py +198 -0
- tests/test_tag_extractor.py +95 -49
- ara_cli/file_loaders/document_readers.py +0 -233
- ara_cli/file_loaders/file_loaders.py +0 -123
- ara_cli/file_loaders/text_file_loader.py +0 -187
- /ara_cli/file_loaders/{binary_file_loader.py → loaders/binary_file_loader.py} +0 -0
- /ara_cli/file_loaders/{image_processor.py → tools/image_processor.py} +0 -0
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/entry_points.txt +0 -0
- {ara_cli-0.1.13.3.dist-info → ara_cli-0.1.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
from unittest.mock import patch, MagicMock
|
|
4
|
+
import pytest
|
|
5
|
+
from ara_cli.prompt_chat import initialize_prompt_chat_mode
|
|
6
|
+
from ara_cli.classifier import Classifier
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestSetupDefaultCombinedChatPromptMode:
    """
    Tests mirroring the scenarios in Setup_Default_Combined_Chat_Prompt_Mode.feature.

    Each test calls ``initialize_prompt_chat_mode`` for the task artefact
    ``123_chat_test`` inside a temporary ``ara/tasks`` tree.  ``Chat.start``,
    ``Chat.start_non_interactive``, ``ConfigManager.get_config`` and
    ``update_artefact_config_prompt_files`` are patched in every test.
    """

    TASK_NAME = "123_chat_test"
    CLASSIFIER = "task"

    @pytest.fixture
    def setup_ara_environment(self):
        """Sets up a temporary directory structure mimicking the ara project.

        Yields the temporary root, which is also made the current working
        directory.  The previous working directory is restored in a
        ``finally`` clause so that an error raised after ``os.chdir`` cannot
        leave the process inside a directory that is about to be deleted.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            previous_cwd = os.getcwd()
            # Change to tmpdir to ensure relative paths work if used
            os.chdir(tmpdir)
            try:
                # Setup base directories
                os.makedirs(os.path.join(tmpdir, "ara", "tasks"), exist_ok=True)
                yield tmpdir
            finally:
                os.chdir(previous_cwd)

    @staticmethod
    def _make_data_dir(root_dir, task_name):
        """Create ``ara/tasks/<task_name>.data`` under *root_dir* and return its path."""
        data_dir = os.path.join(root_dir, "ara", "tasks", f"{task_name}.data")
        os.makedirs(data_dir, exist_ok=True)
        return data_dir

    @staticmethod
    def _stub_llm_config(mock_get_config):
        """Make the patched ``get_config`` return a mock with a one-entry ``llm_config``."""
        mock_config = MagicMock()
        mock_config.llm_config = [{"provider": "openai", "model": "gpt-4"}]
        mock_get_config.return_value = mock_config

    @patch("ara_cli.prompt_chat.update_artefact_config_prompt_files")
    @patch("ara_cli.chat.Chat.start", autospec=True)
    @patch("ara_cli.chat.Chat.start_non_interactive", autospec=True)
    @patch("ara_cli.prompt_handler.ConfigManager.get_config")
    def test_scenario_1_start_prompt_chat_no_existing_file(
        self,
        mock_get_config,
        mock_start_non_interactive,
        mock_start,
        mock_update_config,
        setup_ara_environment,
    ):
        """
        Scenario: Start prompt-chat mode with existing prompt.data directory and no existing default chat file.
        Expectation: task_chat.md is created.
        """
        root_dir = setup_ara_environment

        # Mock config to avoid loading real config
        self._stub_llm_config(mock_get_config)

        # Simulate directory creation normally handled by update_artefact_config_prompt_files
        data_dir = self._make_data_dir(root_dir, self.TASK_NAME)

        # Execute
        initialize_prompt_chat_mode(
            classifier=self.CLASSIFIER,
            param=self.TASK_NAME,
            chat_name=None,  # Default to classifier name 'task' -> 'task_chat.md'
            reset=None,
        )

        # Expected path: ara/tasks/123_chat_test.data/task_chat.md
        expected_chat_path = os.path.join(data_dir, "task_chat.md")

        assert os.path.exists(
            expected_chat_path
        ), f"Chat file not created at {expected_chat_path}"

        with open(expected_chat_path, "r") as f:
            content = f.read()
        assert "# ara prompt:" in content

        # Verify update config called
        mock_update_config.assert_called_once()

    @patch("ara_cli.prompt_chat.update_artefact_config_prompt_files")
    @patch("ara_cli.chat.Chat.start", autospec=True)
    @patch("ara_cli.chat.Chat.start_non_interactive", autospec=True)
    @patch("ara_cli.prompt_handler.ConfigManager.get_config")
    @patch("sys.stdin.readline", return_value="y\n")  # Simulate user typing 'y'
    @patch("sys.stdout.write")  # Keeps the interactive prompt off the real stdout
    def test_scenario_2_reset_existing_chat(
        self,
        mock_stdout,
        mock_stdin,
        mock_get_config,
        mock_start_non_interactive,
        mock_start,
        mock_update_config,
        setup_ara_environment,
    ):
        """
        Scenario: Start prompt-chat mode with existing chat file, choose to reset.
        Expectation: User prompted, file content reset.
        """
        root_dir = setup_ara_environment

        # Setup existing file
        data_dir = self._make_data_dir(root_dir, self.TASK_NAME)
        chat_path = os.path.join(data_dir, "task_chat.md")
        with open(chat_path, "w") as f:
            f.write("# ara prompt:\nOld Content")

        # Mock config
        self._stub_llm_config(mock_get_config)

        # Execute
        initialize_prompt_chat_mode(
            classifier=self.CLASSIFIER,
            param=self.TASK_NAME,
            chat_name=None,
            reset=None,  # Should trigger interactive prompt
        )

        # NOTE(review): the prompt text itself is deliberately not asserted.
        # An earlier version scanned mock_stdout.call_args_list for
        # "already exists. Do you want to reset the chat?" but never asserted
        # the result, so that dead scan was removed.  The file reset below is
        # the outcome this scenario verifies.
        with open(chat_path, "r") as f:
            content = f.read()

        assert content == "# ara prompt:\n", "Chat file should have been reset"
        assert "Old Content" not in content

    @patch("ara_cli.prompt_chat.update_artefact_config_prompt_files")
    @patch("ara_cli.chat.Chat.start", autospec=True)
    @patch("ara_cli.chat.Chat.start_non_interactive", autospec=True)
    @patch("ara_cli.prompt_handler.ConfigManager.get_config")
    @patch("sys.stdin.readline", return_value="n\n")  # Simulate user typing 'n'
    def test_scenario_3_append_existing_chat(
        self,
        mock_stdin,
        mock_get_config,
        mock_start_non_interactive,
        mock_start,
        mock_update_config,
        setup_ara_environment,
    ):
        """
        Scenario: Start prompt-chat mode with existing chat file, choose to append.
        Expectation: File content preserved.
        """
        root_dir = setup_ara_environment

        # Setup existing file
        data_dir = self._make_data_dir(root_dir, self.TASK_NAME)
        chat_path = os.path.join(data_dir, "task_chat.md")

        original_content = "# ara prompt:\nOld Content"
        with open(chat_path, "w") as f:
            f.write(original_content)

        # Mock config
        self._stub_llm_config(mock_get_config)

        # Execute
        initialize_prompt_chat_mode(
            classifier=self.CLASSIFIER,
            param=self.TASK_NAME,
            chat_name=None,
            reset=None,
        )

        # Verify content preserved
        with open(chat_path, "r") as f:
            content = f.read()

        assert content == original_content, "Chat file content should be preserved"
|
tests/test_tag_extractor.py
CHANGED
|
@@ -7,61 +7,107 @@ from ara_cli.list_filter import ListFilter
|
|
|
7
7
|
@pytest.fixture
|
|
8
8
|
def artefact():
|
|
9
9
|
"""Fixture to create a mock artefact object."""
|
|
10
|
+
|
|
10
11
|
class Artefact:
|
|
11
|
-
def __init__(
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
tags,
|
|
15
|
+
status,
|
|
16
|
+
users,
|
|
17
|
+
author="creator_unknown",
|
|
18
|
+
path="dummy.md",
|
|
19
|
+
content="",
|
|
20
|
+
):
|
|
12
21
|
self.tags = tags
|
|
13
22
|
self.status = status
|
|
14
23
|
self.users = users
|
|
15
24
|
self.author = author
|
|
16
25
|
self.path = path
|
|
17
26
|
self.content = content
|
|
27
|
+
|
|
18
28
|
return Artefact
|
|
19
29
|
|
|
20
30
|
|
|
21
|
-
@pytest.mark.parametrize(
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
]
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
31
|
+
@pytest.mark.parametrize(
|
|
32
|
+
"navigate_to_target, filtered_extra_column, list_filter, artefact_data, expected_tags",
|
|
33
|
+
[
|
|
34
|
+
(
|
|
35
|
+
False,
|
|
36
|
+
False,
|
|
37
|
+
None,
|
|
38
|
+
{
|
|
39
|
+
"artefacts": [
|
|
40
|
+
(["tag1", "tag2"], "in-progress", ["user1"], "creator_unknown"),
|
|
41
|
+
(["tag3"], "done", ["user2"], "creator_unknown"),
|
|
42
|
+
]
|
|
43
|
+
},
|
|
44
|
+
[
|
|
45
|
+
"creator_unknown",
|
|
46
|
+
"done",
|
|
47
|
+
"in-progress",
|
|
48
|
+
"tag1",
|
|
49
|
+
"tag2",
|
|
50
|
+
"tag3",
|
|
51
|
+
"user_user1",
|
|
52
|
+
"user_user2",
|
|
53
|
+
],
|
|
54
|
+
),
|
|
55
|
+
(
|
|
56
|
+
False,
|
|
57
|
+
True,
|
|
58
|
+
None,
|
|
59
|
+
{
|
|
60
|
+
"artefacts": [
|
|
61
|
+
(
|
|
62
|
+
["project_a", "priority_high"],
|
|
63
|
+
None,
|
|
64
|
+
["user1"],
|
|
65
|
+
"creator_unknown",
|
|
66
|
+
),
|
|
67
|
+
(["feature_x"], "done", ["user2"], "creator_unknown"),
|
|
68
|
+
]
|
|
69
|
+
},
|
|
70
|
+
["project_a"],
|
|
71
|
+
),
|
|
72
|
+
(
|
|
73
|
+
False,
|
|
74
|
+
False,
|
|
75
|
+
ListFilter(include_tags=["kritik"]),
|
|
76
|
+
{
|
|
77
|
+
"artefacts": [
|
|
78
|
+
(["release", "kritik"], "review", ["dev1"], "creator_unknown"),
|
|
79
|
+
(["bugfix"], "to-do", ["dev2"], "creator_unknown"),
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
["creator_unknown", "kritik", "release", "review", "user_dev1"],
|
|
83
|
+
),
|
|
84
|
+
(
|
|
85
|
+
True,
|
|
86
|
+
False,
|
|
87
|
+
None,
|
|
88
|
+
{"artefacts": [(["tag3"], "status2", ["user3"], "creator_unknown")]},
|
|
89
|
+
["creator_unknown", "status2", "tag3", "user_user3"],
|
|
90
|
+
),
|
|
91
|
+
(False, False, None, {"artefacts": []}, []),
|
|
92
|
+
],
|
|
93
|
+
)
|
|
94
|
+
@patch("ara_cli.artefact_reader.ArtefactReader")
|
|
95
|
+
@patch("ara_cli.template_manager.DirectoryNavigator")
|
|
96
|
+
def test_extract_tags(
|
|
97
|
+
mock_directory_navigator,
|
|
98
|
+
mock_artefact_reader,
|
|
99
|
+
artefact,
|
|
100
|
+
navigate_to_target,
|
|
101
|
+
filtered_extra_column,
|
|
102
|
+
list_filter,
|
|
103
|
+
artefact_data,
|
|
104
|
+
expected_tags,
|
|
105
|
+
):
|
|
106
|
+
mock_artefacts = {
|
|
107
|
+
key: [artefact(*data) for data in artefact_list]
|
|
108
|
+
for key, artefact_list in artefact_data.items()
|
|
109
|
+
}
|
|
110
|
+
mock_artefact_reader.return_value.read_artefacts.return_value = mock_artefacts
|
|
65
111
|
|
|
66
112
|
mock_navigator_instance = mock_directory_navigator.return_value
|
|
67
113
|
mock_navigator_instance.navigate_to_target = MagicMock()
|
|
@@ -71,7 +117,7 @@ def test_extract_tags(mock_directory_navigator, mock_artefact_reader, artefact,
|
|
|
71
117
|
result = tag_extractor.extract_tags(
|
|
72
118
|
navigate_to_target=navigate_to_target,
|
|
73
119
|
filtered_extra_column=filtered_extra_column,
|
|
74
|
-
list_filter=list_filter
|
|
120
|
+
list_filter=list_filter,
|
|
75
121
|
)
|
|
76
122
|
|
|
77
123
|
if navigate_to_target:
|
|
@@ -79,11 +125,11 @@ def test_extract_tags(mock_directory_navigator, mock_artefact_reader, artefact,
|
|
|
79
125
|
else:
|
|
80
126
|
mock_navigator_instance.navigate_to_target.assert_not_called()
|
|
81
127
|
|
|
82
|
-
mock_artefact_reader.read_artefacts.assert_called_once()
|
|
128
|
+
mock_artefact_reader.return_value.read_artefacts.assert_called_once()
|
|
83
129
|
|
|
84
130
|
# Convert dictionary result to flat list for comparison
|
|
85
131
|
actual_tags = []
|
|
86
132
|
for group in result.values():
|
|
87
133
|
actual_tags.extend(group)
|
|
88
|
-
|
|
89
|
-
assert sorted(actual_tags) == sorted(expected_tags)
|
|
134
|
+
|
|
135
|
+
assert sorted(actual_tags) == sorted(expected_tags)
|
|
@@ -1,233 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
from typing import Tuple, Optional
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class DocumentReader(ABC):
    """Abstract base class for document readers.

    Stores the document path and its containing directory, and offers
    helpers for saving extracted images next to the document.
    """

    def __init__(self, file_path: str):
        """Remember *file_path* and the directory that contains it."""
        self.file_path = file_path
        self.base_dir = os.path.dirname(file_path)

    @abstractmethod
    def read(self, extract_images: bool = False) -> str:
        """Read document and optionally extract images"""
        pass

    def create_image_data_dir(self, extension_suffix: str) -> str:
        """
        Create data directory for images with file extension suffix to avoid conflicts.

        The directory is ``<base_dir>/<stem>_<extension_suffix>.data/images``,
        where ``<stem>`` is the document file name without its extension.

        Returns:
            str: Path to images directory
        """
        stem = os.path.splitext(os.path.basename(self.file_path))[0]
        data_dir = os.path.join(self.base_dir, f"{stem}_{extension_suffix}.data")
        images_dir = os.path.join(data_dir, "images")
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() first: a directory created in between no longer
        # makes makedirs raise FileExistsError.
        os.makedirs(images_dir, exist_ok=True)
        return images_dir

    def save_and_describe_image(self, image_data: bytes, image_format: str,
                                save_dir: str, image_counter: int) -> Tuple[str, str]:
        """
        Save image data and get its description from LLM.

        The image is written to ``<save_dir>/<image_counter>.<image_format>``
        and then passed to ``ara_cli.prompt_handler.describe_image``.

        Returns:
            tuple: (relative_image_path, description), where the path is
            relative to the document's directory.
        """
        # Imported here rather than at module level, as in the original code.
        from ara_cli.prompt_handler import describe_image

        # Save image
        image_filename = f"{image_counter}.{image_format}"
        image_path = os.path.join(save_dir, image_filename)

        with open(image_path, "wb") as image_file:
            image_file.write(image_data)

        # Get image description from LLM
        description = describe_image(image_path)

        # Get relative path
        relative_image_path = os.path.relpath(image_path, self.base_dir)

        return relative_image_path, description
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
class DocxReader(DocumentReader):
    """Reader for DOCX files"""

    def read(self, extract_images: bool = False) -> str:
        """Return the paragraph text of the document, one paragraph per line.

        When *extract_images* is true, every embedded image relationship of
        the main document part is saved via ``save_and_describe_image`` and
        an "### Extracted Images" section listing path and description is
        appended to the text.
        """
        import docx

        doc = docx.Document(self.file_path)
        text_content = '\n'.join(para.text for para in doc.paragraphs)

        if not extract_images:
            return text_content

        from PIL import Image
        import io

        # Create data directory for images
        images_dir = self.create_image_data_dir("docx")

        # Extract and process images
        image_descriptions = []
        image_counter = 1

        for rel in doc.part.rels.values():
            if "image" not in rel.reltype:
                continue
            # Linked (external) images have no package part; accessing
            # target_part on them raises, so there is nothing to extract.
            if rel.is_external:
                continue

            image_data = rel.target_part.blob

            # Determine image format; the context manager releases the
            # PIL handle as soon as the format has been read.
            with Image.open(io.BytesIO(image_data)) as image:
                image_format = image.format.lower()

            # Save and describe image
            relative_path, description = self.save_and_describe_image(
                image_data, image_format, images_dir, image_counter
            )

            # Add formatted description to list
            image_descriptions.append(f"\nImage: {relative_path}\n[{description}]\n")
            image_counter += 1

        # Combine text content with image descriptions
        if image_descriptions:
            text_content += "\n\n### Extracted Images\n" + "\n".join(image_descriptions)

        return text_content
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
class PdfReader(DocumentReader):
    """Reader for PDF files"""

    def read(self, extract_images: bool = False) -> str:
        """Return the document as markdown produced by ``pymupdf4llm.to_markdown``.

        When *extract_images* is true, the images listed by
        ``page.get_images()`` for every page are saved via
        ``save_and_describe_image`` and an "### Extracted Images" section
        listing path and description is appended to the markdown.
        """
        import pymupdf4llm

        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import fitz  # PyMuPDF

        # Create images directory
        images_dir = self.create_image_data_dir("pdf")

        # Extract text without images first
        text_content = pymupdf4llm.to_markdown(self.file_path, write_images=False)

        # Extract and process images
        image_descriptions = []
        image_counter = 1

        doc = fitz.open(self.file_path)
        try:
            for page in doc:
                for img in page.get_images():
                    # First element of each image entry is its xref
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Save and describe image
                    relative_path, description = self.save_and_describe_image(
                        image_bytes, image_ext, images_dir, image_counter
                    )

                    # Add formatted description to list
                    image_descriptions.append(
                        f"\nImage: {relative_path}\n[{description}]\n"
                    )
                    image_counter += 1
        finally:
            # Close the document even when extraction or description fails,
            # so the file handle is not leaked.
            doc.close()

        # Combine text content with image descriptions
        if image_descriptions:
            text_content += "\n\n### Extracted Images\n" + "\n".join(image_descriptions)

        return text_content
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
class OdtReader(DocumentReader):
    """Reader for ODT files"""

    def read(self, extract_images: bool = False) -> str:
        """Return the document as markdown produced by ``pymupdf4llm.to_markdown``.

        When *extract_images* is true, the files stored under ``Pictures/``
        inside the ODT zip archive are saved via ``save_and_describe_image``
        and an "### Extracted Images" section listing path and description is
        appended.  Image extraction is best-effort: on failure a warning is
        printed and the text (plus any images handled so far) is returned.
        """
        import pymupdf4llm

        if not extract_images:
            return pymupdf4llm.to_markdown(self.file_path, write_images=False)

        import zipfile
        from PIL import Image
        import io

        # Create data directory for images
        images_dir = self.create_image_data_dir("odt")

        # Get text content
        text_content = pymupdf4llm.to_markdown(self.file_path, write_images=False)

        # Extract and process images from ODT
        image_descriptions = []
        image_counter = 1

        try:
            with zipfile.ZipFile(self.file_path, 'r') as odt_zip:
                # List all files in the Pictures directory.  Names ending in
                # "/" are directory entries: they read as empty bytes, which
                # PIL cannot identify, and would abort the whole extraction.
                picture_files = [
                    name for name in odt_zip.namelist()
                    if name.startswith('Pictures/') and not name.endswith('/')
                ]

                for picture_file in picture_files:
                    # Extract image data
                    image_data = odt_zip.read(picture_file)

                    # Determine image format; the context manager releases
                    # the PIL handle once the format has been read.
                    with Image.open(io.BytesIO(image_data)) as image:
                        image_format = image.format.lower()

                    # Save and describe image
                    relative_path, description = self.save_and_describe_image(
                        image_data, image_format, images_dir, image_counter
                    )

                    # Add formatted description to list
                    image_descriptions.append(
                        f"\nImage: {relative_path}\n[{description}]\n"
                    )
                    image_counter += 1
        except Exception as e:
            # Deliberately broad: image extraction must never prevent the
            # text content from being returned.
            print(f"Warning: Could not extract images from ODT: {e}")

        # Combine text content with image descriptions
        if image_descriptions:
            text_content += "\n\n### Extracted Images\n" + "\n".join(image_descriptions)

        return text_content
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
class DocumentReaderFactory:
    """Factory that picks a document reader from a file's extension."""

    @staticmethod
    def create_reader(file_path: str) -> Optional[DocumentReader]:
        """Return a reader instance for *file_path*, or ``None`` if unsupported.

        The extension is compared case-insensitively; ``.docx``, ``.pdf``
        and ``.odt`` are recognised.
        """
        extension = os.path.splitext(file_path)[1].lower()

        reader_by_extension = {
            '.docx': DocxReader,
            '.pdf': PdfReader,
            '.odt': OdtReader,
        }

        if extension not in reader_by_extension:
            return None

        return reader_by_extension[extension](file_path)
|