kweaver-dolphin 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DolphinLanguageSDK/__init__.py +58 -0
- dolphin/__init__.py +62 -0
- dolphin/cli/__init__.py +20 -0
- dolphin/cli/args/__init__.py +9 -0
- dolphin/cli/args/parser.py +567 -0
- dolphin/cli/builtin_agents/__init__.py +22 -0
- dolphin/cli/commands/__init__.py +4 -0
- dolphin/cli/interrupt/__init__.py +8 -0
- dolphin/cli/interrupt/handler.py +205 -0
- dolphin/cli/interrupt/keyboard.py +82 -0
- dolphin/cli/main.py +49 -0
- dolphin/cli/multimodal/__init__.py +34 -0
- dolphin/cli/multimodal/clipboard.py +327 -0
- dolphin/cli/multimodal/handler.py +249 -0
- dolphin/cli/multimodal/image_processor.py +214 -0
- dolphin/cli/multimodal/input_parser.py +149 -0
- dolphin/cli/runner/__init__.py +8 -0
- dolphin/cli/runner/runner.py +989 -0
- dolphin/cli/ui/__init__.py +10 -0
- dolphin/cli/ui/console.py +2795 -0
- dolphin/cli/ui/input.py +340 -0
- dolphin/cli/ui/layout.py +425 -0
- dolphin/cli/ui/stream_renderer.py +302 -0
- dolphin/cli/utils/__init__.py +8 -0
- dolphin/cli/utils/helpers.py +135 -0
- dolphin/cli/utils/version.py +49 -0
- dolphin/core/__init__.py +107 -0
- dolphin/core/agent/__init__.py +10 -0
- dolphin/core/agent/agent_state.py +69 -0
- dolphin/core/agent/base_agent.py +970 -0
- dolphin/core/code_block/__init__.py +0 -0
- dolphin/core/code_block/agent_init_block.py +0 -0
- dolphin/core/code_block/assign_block.py +98 -0
- dolphin/core/code_block/basic_code_block.py +1865 -0
- dolphin/core/code_block/explore_block.py +1327 -0
- dolphin/core/code_block/explore_block_v2.py +712 -0
- dolphin/core/code_block/explore_strategy.py +672 -0
- dolphin/core/code_block/judge_block.py +220 -0
- dolphin/core/code_block/prompt_block.py +32 -0
- dolphin/core/code_block/skill_call_deduplicator.py +291 -0
- dolphin/core/code_block/tool_block.py +129 -0
- dolphin/core/common/__init__.py +17 -0
- dolphin/core/common/constants.py +176 -0
- dolphin/core/common/enums.py +1173 -0
- dolphin/core/common/exceptions.py +133 -0
- dolphin/core/common/multimodal.py +539 -0
- dolphin/core/common/object_type.py +165 -0
- dolphin/core/common/output_format.py +432 -0
- dolphin/core/common/types.py +36 -0
- dolphin/core/config/__init__.py +16 -0
- dolphin/core/config/global_config.py +1289 -0
- dolphin/core/config/ontology_config.py +133 -0
- dolphin/core/context/__init__.py +12 -0
- dolphin/core/context/context.py +1580 -0
- dolphin/core/context/context_manager.py +161 -0
- dolphin/core/context/var_output.py +82 -0
- dolphin/core/context/variable_pool.py +356 -0
- dolphin/core/context_engineer/__init__.py +41 -0
- dolphin/core/context_engineer/config/__init__.py +5 -0
- dolphin/core/context_engineer/config/settings.py +402 -0
- dolphin/core/context_engineer/core/__init__.py +7 -0
- dolphin/core/context_engineer/core/budget_manager.py +327 -0
- dolphin/core/context_engineer/core/context_assembler.py +583 -0
- dolphin/core/context_engineer/core/context_manager.py +637 -0
- dolphin/core/context_engineer/core/tokenizer_service.py +260 -0
- dolphin/core/context_engineer/example/incremental_example.py +267 -0
- dolphin/core/context_engineer/example/traditional_example.py +334 -0
- dolphin/core/context_engineer/services/__init__.py +5 -0
- dolphin/core/context_engineer/services/compressor.py +399 -0
- dolphin/core/context_engineer/utils/__init__.py +6 -0
- dolphin/core/context_engineer/utils/context_utils.py +441 -0
- dolphin/core/context_engineer/utils/message_formatter.py +270 -0
- dolphin/core/context_engineer/utils/token_utils.py +139 -0
- dolphin/core/coroutine/__init__.py +15 -0
- dolphin/core/coroutine/context_snapshot.py +154 -0
- dolphin/core/coroutine/context_snapshot_profile.py +922 -0
- dolphin/core/coroutine/context_snapshot_store.py +268 -0
- dolphin/core/coroutine/execution_frame.py +145 -0
- dolphin/core/coroutine/execution_state_registry.py +161 -0
- dolphin/core/coroutine/resume_handle.py +101 -0
- dolphin/core/coroutine/step_result.py +101 -0
- dolphin/core/executor/__init__.py +18 -0
- dolphin/core/executor/debug_controller.py +630 -0
- dolphin/core/executor/dolphin_executor.py +1063 -0
- dolphin/core/executor/executor.py +624 -0
- dolphin/core/flags/__init__.py +27 -0
- dolphin/core/flags/definitions.py +49 -0
- dolphin/core/flags/manager.py +113 -0
- dolphin/core/hook/__init__.py +95 -0
- dolphin/core/hook/expression_evaluator.py +499 -0
- dolphin/core/hook/hook_dispatcher.py +380 -0
- dolphin/core/hook/hook_types.py +248 -0
- dolphin/core/hook/isolated_variable_pool.py +284 -0
- dolphin/core/interfaces.py +53 -0
- dolphin/core/llm/__init__.py +0 -0
- dolphin/core/llm/llm.py +495 -0
- dolphin/core/llm/llm_call.py +100 -0
- dolphin/core/llm/llm_client.py +1285 -0
- dolphin/core/llm/message_sanitizer.py +120 -0
- dolphin/core/logging/__init__.py +20 -0
- dolphin/core/logging/logger.py +526 -0
- dolphin/core/message/__init__.py +8 -0
- dolphin/core/message/compressor.py +749 -0
- dolphin/core/parser/__init__.py +8 -0
- dolphin/core/parser/parser.py +405 -0
- dolphin/core/runtime/__init__.py +10 -0
- dolphin/core/runtime/runtime_graph.py +926 -0
- dolphin/core/runtime/runtime_instance.py +446 -0
- dolphin/core/skill/__init__.py +14 -0
- dolphin/core/skill/context_retention.py +157 -0
- dolphin/core/skill/skill_function.py +686 -0
- dolphin/core/skill/skill_matcher.py +282 -0
- dolphin/core/skill/skillkit.py +700 -0
- dolphin/core/skill/skillset.py +72 -0
- dolphin/core/trajectory/__init__.py +10 -0
- dolphin/core/trajectory/recorder.py +189 -0
- dolphin/core/trajectory/trajectory.py +522 -0
- dolphin/core/utils/__init__.py +9 -0
- dolphin/core/utils/cache_kv.py +212 -0
- dolphin/core/utils/tools.py +340 -0
- dolphin/lib/__init__.py +93 -0
- dolphin/lib/debug/__init__.py +8 -0
- dolphin/lib/debug/visualizer.py +409 -0
- dolphin/lib/memory/__init__.py +28 -0
- dolphin/lib/memory/async_processor.py +220 -0
- dolphin/lib/memory/llm_calls.py +195 -0
- dolphin/lib/memory/manager.py +78 -0
- dolphin/lib/memory/sandbox.py +46 -0
- dolphin/lib/memory/storage.py +245 -0
- dolphin/lib/memory/utils.py +51 -0
- dolphin/lib/ontology/__init__.py +12 -0
- dolphin/lib/ontology/basic/__init__.py +0 -0
- dolphin/lib/ontology/basic/base.py +102 -0
- dolphin/lib/ontology/basic/concept.py +130 -0
- dolphin/lib/ontology/basic/object.py +11 -0
- dolphin/lib/ontology/basic/relation.py +63 -0
- dolphin/lib/ontology/datasource/__init__.py +27 -0
- dolphin/lib/ontology/datasource/datasource.py +66 -0
- dolphin/lib/ontology/datasource/oracle_datasource.py +338 -0
- dolphin/lib/ontology/datasource/sql.py +845 -0
- dolphin/lib/ontology/mapping.py +177 -0
- dolphin/lib/ontology/ontology.py +733 -0
- dolphin/lib/ontology/ontology_context.py +16 -0
- dolphin/lib/ontology/ontology_manager.py +107 -0
- dolphin/lib/skill_results/__init__.py +31 -0
- dolphin/lib/skill_results/cache_backend.py +559 -0
- dolphin/lib/skill_results/result_processor.py +181 -0
- dolphin/lib/skill_results/result_reference.py +179 -0
- dolphin/lib/skill_results/skillkit_hook.py +324 -0
- dolphin/lib/skill_results/strategies.py +328 -0
- dolphin/lib/skill_results/strategy_registry.py +150 -0
- dolphin/lib/skillkits/__init__.py +44 -0
- dolphin/lib/skillkits/agent_skillkit.py +155 -0
- dolphin/lib/skillkits/cognitive_skillkit.py +82 -0
- dolphin/lib/skillkits/env_skillkit.py +250 -0
- dolphin/lib/skillkits/mcp_adapter.py +616 -0
- dolphin/lib/skillkits/mcp_skillkit.py +771 -0
- dolphin/lib/skillkits/memory_skillkit.py +650 -0
- dolphin/lib/skillkits/noop_skillkit.py +31 -0
- dolphin/lib/skillkits/ontology_skillkit.py +89 -0
- dolphin/lib/skillkits/plan_act_skillkit.py +452 -0
- dolphin/lib/skillkits/resource/__init__.py +52 -0
- dolphin/lib/skillkits/resource/models/__init__.py +6 -0
- dolphin/lib/skillkits/resource/models/skill_config.py +109 -0
- dolphin/lib/skillkits/resource/models/skill_meta.py +127 -0
- dolphin/lib/skillkits/resource/resource_skillkit.py +393 -0
- dolphin/lib/skillkits/resource/skill_cache.py +215 -0
- dolphin/lib/skillkits/resource/skill_loader.py +395 -0
- dolphin/lib/skillkits/resource/skill_validator.py +406 -0
- dolphin/lib/skillkits/resource_skillkit.py +11 -0
- dolphin/lib/skillkits/search_skillkit.py +163 -0
- dolphin/lib/skillkits/sql_skillkit.py +274 -0
- dolphin/lib/skillkits/system_skillkit.py +509 -0
- dolphin/lib/skillkits/vm_skillkit.py +65 -0
- dolphin/lib/utils/__init__.py +9 -0
- dolphin/lib/utils/data_process.py +207 -0
- dolphin/lib/utils/handle_progress.py +178 -0
- dolphin/lib/utils/security.py +139 -0
- dolphin/lib/utils/text_retrieval.py +462 -0
- dolphin/lib/vm/__init__.py +11 -0
- dolphin/lib/vm/env_executor.py +895 -0
- dolphin/lib/vm/python_session_manager.py +453 -0
- dolphin/lib/vm/vm.py +610 -0
- dolphin/sdk/__init__.py +60 -0
- dolphin/sdk/agent/__init__.py +12 -0
- dolphin/sdk/agent/agent_factory.py +236 -0
- dolphin/sdk/agent/dolphin_agent.py +1106 -0
- dolphin/sdk/api/__init__.py +4 -0
- dolphin/sdk/runtime/__init__.py +8 -0
- dolphin/sdk/runtime/env.py +363 -0
- dolphin/sdk/skill/__init__.py +10 -0
- dolphin/sdk/skill/global_skills.py +706 -0
- dolphin/sdk/skill/traditional_toolkit.py +260 -0
- kweaver_dolphin-0.1.0.dist-info/METADATA +521 -0
- kweaver_dolphin-0.1.0.dist-info/RECORD +199 -0
- kweaver_dolphin-0.1.0.dist-info/WHEEL +5 -0
- kweaver_dolphin-0.1.0.dist-info/entry_points.txt +27 -0
- kweaver_dolphin-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
- kweaver_dolphin-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Multimodal input handler for CLI.
|
|
3
|
+
|
|
4
|
+
Integrates parsing, image reading, and processing to convert user input
|
|
5
|
+
with multimodal markers into proper multimodal content.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import base64
|
|
10
|
+
from typing import Union, List, Dict, Any, Optional
|
|
11
|
+
|
|
12
|
+
from dolphin.cli.multimodal.input_parser import (
|
|
13
|
+
MultimodalInputParser,
|
|
14
|
+
ParsedMultimodalInput,
|
|
15
|
+
ImageSourceType,
|
|
16
|
+
)
|
|
17
|
+
from dolphin.cli.multimodal.clipboard import ClipboardImageReader
|
|
18
|
+
from dolphin.cli.multimodal.image_processor import ImageProcessor, ImageProcessConfig
|
|
19
|
+
from dolphin.core.common.multimodal import (
|
|
20
|
+
text_block,
|
|
21
|
+
image_url_block,
|
|
22
|
+
ClipboardEmptyError,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Type alias for message content
MessageContent = Union[str, List[Dict[str, Any]]]


class MultimodalInputHandler:
    """Handler for processing multimodal CLI input.

    Converts user input with @paste, @image:, @url: markers into
    proper multimodal content compatible with LLM APIs.

    Usage:
        handler = MultimodalInputHandler()
        content = handler.process("@paste describe this image")
        # Returns List[Dict] with image and text blocks
    """

    def __init__(
        self,
        image_config: "Optional[ImageProcessConfig]" = None,
        verbose: bool = False
    ):
        """Initialize the handler.

        Args:
            image_config: Configuration for image processing
            verbose: If True, print status messages
        """
        self.parser = MultimodalInputParser()
        self.clipboard = ClipboardImageReader()
        self.processor = ImageProcessor(image_config)
        self.verbose = verbose

    def process(self, raw_input: str) -> MessageContent:
        """Process user input and convert to message content.

        Args:
            raw_input: Raw user input string

        Returns:
            str if no multimodal markers, List[Dict] if multimodal

        Raises:
            ClipboardEmptyError: If @paste used but clipboard empty
            FileNotFoundError: If @image: file doesn't exist
            Other exceptions from image processing
        """
        # Quick check - if no markers, return as plain text
        if not self.parser.has_multimodal_markers(raw_input):
            return raw_input

        # Parse the input
        parsed = self.parser.parse(raw_input)

        if not parsed.has_images():
            return raw_input

        # Build multimodal content. The parser guarantees
        # len(text_parts) == len(image_refs) + 1, so interleaving text_part[i]
        # followed by image_refs[i] reproduces the original input order.
        content: List[Dict[str, Any]] = []

        for i, text_part in enumerate(parsed.text_parts):
            # Add text block if not empty
            if text_part.strip():
                content.append(text_block(text_part.strip()))

            # Add corresponding image if exists
            if i < len(parsed.image_refs):
                ref = parsed.image_refs[i]
                image_url = self._resolve_image_ref(ref)
                content.append(image_url_block(image_url, detail="auto"))

        # Ensure we have at least one block
        if not content:
            return raw_input

        return content

    def _resolve_image_ref(self, ref) -> str:
        """Resolve an image reference to a usable URL.

        Args:
            ref: ImageReference object

        Returns:
            Image URL (data: URL for local/clipboard, https: for web)

        Raises:
            ValueError: If the reference has an unknown source type
        """
        if ref.source_type == ImageSourceType.CLIPBOARD:
            return self._read_clipboard()

        elif ref.source_type == ImageSourceType.FILE:
            return self._read_file(ref.source)

        elif ref.source_type == ImageSourceType.URL:
            # URL is passed through directly
            return ref.source

        raise ValueError(f"Unknown source type: {ref.source_type}")

    def _read_clipboard(self) -> str:
        """Read and process clipboard image.

        Returns:
            Base64 data URL

        Raises:
            ClipboardEmptyError: If the clipboard holds no image
        """
        data = self.clipboard.read()
        if data is None:
            raise ClipboardEmptyError(
                "No image found in clipboard. "
                "Please copy an image first (Cmd/Ctrl+C on an image)."
            )

        if self.verbose:
            info = self.processor.get_image_info(data)
            print(f"📎 Read clipboard image: {info.get('width', '?')}x{info.get('height', '?')}, "
                  f"{info.get('size_bytes', 0) // 1024}KB")

        # Process the image (resize, compress if needed)
        processed = self.processor.process(data)

        if self.verbose and len(processed) != len(data):
            print(f"  Compressed to {len(processed) // 1024}KB")

        return self.clipboard.to_base64_url(processed)

    def _read_file(self, path: str) -> str:
        """Read and process image file.

        Args:
            path: File path (can be relative or use ~)

        Returns:
            Base64 data URL

        Raises:
            FileNotFoundError: If the file does not exist
        """
        # Expand user home and resolve path
        expanded_path = os.path.expanduser(path)
        if not os.path.isabs(expanded_path):
            expanded_path = os.path.abspath(expanded_path)

        if not os.path.exists(expanded_path):
            raise FileNotFoundError(f"Image file not found: {path}")

        if self.verbose:
            print(f"📁 Reading image file: {path}")

        with open(expanded_path, "rb") as f:
            data = f.read()

        # Process the image first: the processor may resize AND transcode
        # (e.g. GIF/WEBP re-encoded as JPEG or PNG).
        processed = self.processor.process(data)

        # BUGFIX: detect the MIME type from the *processed* bytes. Sniffing
        # the raw input here produced a data URL whose declared type no
        # longer matched the payload after the processor re-encoded it.
        mime_type = self._detect_mime_type(processed)

        if self.verbose:
            info = self.processor.get_image_info(processed)
            print(f"  Image: {info.get('width', '?')}x{info.get('height', '?')}, "
                  f"{len(processed) // 1024}KB")

        return self._to_base64_url(processed, mime_type)

    def _detect_mime_type(self, data: bytes) -> str:
        """Detect MIME type from image data via magic bytes.

        Args:
            data: Image bytes

        Returns:
            MIME type string ("image/png" when the signature is unrecognized)
        """
        # Check magic bytes
        if data[:8] == b'\x89PNG\r\n\x1a\n':
            return "image/png"
        elif data[:2] == b'\xff\xd8':
            return "image/jpeg"
        elif data[:6] in (b'GIF87a', b'GIF89a'):
            return "image/gif"
        elif data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return "image/webp"
        else:
            # Default to PNG for unrecognized signatures (e.g. when Pillow
            # is unavailable and the bytes passed through unprocessed).
            return "image/png"

    def _to_base64_url(self, data: bytes, mime_type: str = "image/png") -> str:
        """Convert image data to base64 data URL.

        Args:
            data: Image bytes
            mime_type: MIME type

        Returns:
            Data URL string
        """
        b64 = base64.b64encode(data).decode('utf-8')
        return f"data:{mime_type};base64,{b64}"

    def check_clipboard_status(self) -> dict:
        """Check if clipboard contains an image.

        Returns:
            Status dict with has_image and optional info
        """
        data = self.clipboard.read()
        if data is None:
            return {"has_image": False}

        info = self.processor.get_image_info(data)
        return {
            "has_image": True,
            "info": info
        }
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# Convenience function for quick processing
def process_multimodal_input(raw_input: str, verbose: bool = False) -> MessageContent:
    """Convert multimodal markers in user input into LLM content blocks.

    A one-shot wrapper around :class:`MultimodalInputHandler` for callers
    that do not need to reuse a handler instance.

    Args:
        raw_input: Raw user input string
        verbose: If True, print status messages

    Returns:
        str if no multimodal markers, List[Dict] if multimodal
    """
    return MultimodalInputHandler(verbose=verbose).process(raw_input)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Image processor for multimodal CLI input.
|
|
3
|
+
|
|
4
|
+
Handles image validation, format conversion, and compression.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import io
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Tuple, Optional
|
|
10
|
+
|
|
11
|
+
from dolphin.core.common.multimodal import (
|
|
12
|
+
UnsupportedImageFormatError,
|
|
13
|
+
ImagePayloadTooLargeError,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ImageProcessConfig:
    """Configuration knobs for :class:`ImageProcessor`."""
    max_size_bytes: int = 4 * 1024 * 1024  # hard ceiling on payload size (4MB)
    max_dimension: int = 2048  # longest allowed edge, in pixels
    quality: int = 85  # JPEG quality for the first encode pass
    # MPO (Multi-Picture Object) is a JPEG-based multi-frame format produced
    # by some cameras/phones, hence treated as an allowed input.
    allowed_formats: Tuple[str, ...] = ("PNG", "JPEG", "GIF", "WEBP", "MPO")
    auto_compress: bool = True  # shrink/re-encode oversized images instead of raising


class ImageProcessor:
    """Image preprocessor for CLI multimodal input.

    Validates the input format, shrinks oversized images, and re-encodes
    them to PNG (when transparency is involved) or JPEG (opaque images).

    Usage:
        processor = ImageProcessor()
        processed_data = processor.process(raw_image_data)
    """

    def __init__(self, config: Optional[ImageProcessConfig] = None):
        """Store the processing configuration, defaulting when omitted.

        Args:
            config: Processing configuration (uses defaults if not provided)
        """
        self.config = config or ImageProcessConfig()

    def process(self, image_data: bytes) -> bytes:
        """Validate, shrink, and re-encode an image.

        Args:
            image_data: Raw image bytes

        Returns:
            Processed image bytes

        Raises:
            UnsupportedImageFormatError: If format not allowed
            ImagePayloadTooLargeError: If size exceeds limit (after compression attempt)
        """
        try:
            from PIL import Image
        except ImportError:
            # Without Pillow we cannot transform anything; enforce the size
            # cap and hand the bytes back untouched.
            if len(image_data) > self.config.max_size_bytes:
                raise ImagePayloadTooLargeError(
                    f"Image size {len(image_data)} exceeds limit {self.config.max_size_bytes}. "
                    f"Install Pillow for automatic compression."
                )
            return image_data

        img = Image.open(io.BytesIO(image_data))

        # Reject disallowed formats. An unidentifiable format (None) passes
        # through here and is normalized by the re-encode below.
        detected = img.format.upper() if img.format else None
        if detected and detected not in self.config.allowed_formats:
            raise UnsupportedImageFormatError(
                f"Image format '{img.format}' not supported. "
                f"Allowed formats: {', '.join(self.config.allowed_formats)}"
            )

        # Enforce the dimension cap, shrinking when allowed.
        if max(img.size) > self.config.max_dimension:
            if not self.config.auto_compress:
                raise ImagePayloadTooLargeError(
                    f"Image dimensions {img.size} exceed limit {self.config.max_dimension}. "
                    f"Enable auto_compress to automatically resize."
                )
            img = self._resize(img)

        buffer = io.BytesIO()

        # Pick the output encoding from the image mode.
        if img.mode in ('RGBA', 'LA', 'P'):
            # PNG keeps transparency intact.
            if img.mode == 'P' and 'transparency' in img.info:
                img = img.convert('RGBA')
            output_format = "PNG"
            img.save(buffer, format=output_format, optimize=True)
        else:
            # Opaque images go to JPEG for a smaller payload.
            if img.mode != 'RGB':
                img = img.convert('RGB')
            output_format = "JPEG"
            img.save(buffer, format=output_format, quality=self.config.quality, optimize=True)

        result = buffer.getvalue()

        # Final size check, with one harsher re-encode attempt if allowed.
        if len(result) > self.config.max_size_bytes:
            if self.config.auto_compress:
                result = self._aggressive_compress(img, output_format)

            if len(result) > self.config.max_size_bytes:
                raise ImagePayloadTooLargeError(
                    f"Image size {len(result)} exceeds limit {self.config.max_size_bytes} "
                    f"even after compression."
                )

        return result

    def _resize(self, img) -> "Image.Image":
        """Scale ``img`` down so its longest edge equals ``max_dimension``.

        Args:
            img: PIL Image object

        Returns:
            Resized PIL Image (aspect ratio preserved)
        """
        from PIL import Image

        scale = self.config.max_dimension / max(img.size)
        target = (int(img.width * scale), int(img.height * scale))
        return img.resize(target, Image.Resampling.LANCZOS)

    def _aggressive_compress(self, img, output_format: str) -> bytes:
        """Re-encode with harsher settings (edge <= 1024px, JPEG quality 60).

        Args:
            img: PIL Image object
            output_format: Target format ("JPEG" or "PNG")

        Returns:
            Compressed image bytes
        """
        buffer = io.BytesIO()

        # Shrink further when still above the reduced ceiling.
        limit = min(self.config.max_dimension, 1024)
        if max(img.size) > limit:
            from PIL import Image
            scale = limit / max(img.size)
            target = (int(img.width * scale), int(img.height * scale))
            img = img.resize(target, Image.Resampling.LANCZOS)

        if output_format == "JPEG":
            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.save(buffer, format="JPEG", quality=60, optimize=True)
        else:
            # PNG has no quality dial; rely on optimize + the resize above.
            img.save(buffer, format="PNG", optimize=True)

        return buffer.getvalue()

    def get_image_info(self, image_data: bytes) -> dict:
        """Describe an image: format, mode, dimensions, byte size.

        Args:
            image_data: Raw image bytes

        Returns:
            Dict with format/mode/width/height/size_bytes, or an "error"
            entry when the bytes cannot be decoded (or Pillow is missing).
            ``size_bytes`` is always present.
        """
        try:
            from PIL import Image

            img = Image.open(io.BytesIO(image_data))
            return {
                "format": img.format,
                "mode": img.mode,
                "width": img.width,
                "height": img.height,
                "size_bytes": len(image_data),
            }
        except Exception as e:
            return {
                "error": str(e),
                "size_bytes": len(image_data),
            }

    def validate_file(self, file_path: str) -> bool:
        """Return True when ``file_path`` holds a decodable image.

        Args:
            file_path: Path to image file

        Returns:
            True if valid image (False on any error, including missing Pillow)
        """
        try:
            from PIL import Image

            with Image.open(file_path) as img:
                img.verify()
            return True
        except Exception:
            return False
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Input parser for multimodal CLI input.
|
|
3
|
+
|
|
4
|
+
Parses user input to extract multimodal references like @paste, @image:<path>, @url:<url>.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import List, Tuple
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ImageSourceType(Enum):
    """Type of image source."""
    CLIPBOARD = "clipboard"  # @paste
    FILE = "file"  # @image:<path>
    URL = "url"  # @url:<url>


@dataclass
class ImageReference:
    """Reference to an image in user input."""
    source_type: ImageSourceType
    source: str  # Path, URL, or "clipboard"
    position: int  # Character offset of the marker in the original text
    original_text: str  # Original matched text (e.g., "@paste", "@image:./foo.png")


@dataclass
class ParsedMultimodalInput:
    """Result of parsing multimodal input."""
    text_parts: List[str]  # Text fragments after removing image references
    image_refs: List[ImageReference]  # List of image references
    original_input: str  # Original unmodified input

    def has_images(self) -> bool:
        """Check if input contains any image references."""
        return len(self.image_refs) > 0

    def get_combined_text(self) -> str:
        """Get all text parts combined (without image markers)."""
        return " ".join(part.strip() for part in self.text_parts if part.strip())


class MultimodalInputParser:
    """Parser for multimodal CLI input.

    Supports:
    - @paste: Read image from clipboard
    - @image:<path>: Read image from local file
    - @url:<url>: Reference image by URL (https only)

    Example:
        parser = MultimodalInputParser()
        result = parser.parse("@paste describe this image")
        # result.has_images() == True
        # result.image_refs[0].source_type == ImageSourceType.CLIPBOARD
    """

    # Pattern definitions.
    # BUGFIX: \b stops "@paste" from matching inside longer words such as
    # "@pastel" or "@pasted", which would wrongly trigger a clipboard read.
    PASTE_PATTERN = r"@paste\b"
    IMAGE_PATTERN = r"@image:([^\s]+)"
    URL_PATTERN = r"@url:(https://[^\s]+)"

    def __init__(self):
        """Initialize the parser with compiled regex patterns."""
        self._patterns = [
            (re.compile(self.PASTE_PATTERN, re.IGNORECASE), ImageSourceType.CLIPBOARD),
            (re.compile(self.IMAGE_PATTERN, re.IGNORECASE), ImageSourceType.FILE),
            (re.compile(self.URL_PATTERN, re.IGNORECASE), ImageSourceType.URL),
        ]

    def parse(self, raw_input: str) -> ParsedMultimodalInput:
        """Parse raw input to extract multimodal references.

        Args:
            raw_input: User's raw input string

        Returns:
            ParsedMultimodalInput containing text parts and image references.
            Invariant: len(text_parts) == len(image_refs) + 1, so callers can
            interleave text and images in original order.
        """
        if not raw_input:
            return ParsedMultimodalInput(
                text_parts=[""],
                image_refs=[],
                original_input=raw_input
            )

        # Find all matches with their positions
        matches: List[Tuple[int, int, ImageReference]] = []

        for pattern, source_type in self._patterns:
            for match in pattern.finditer(raw_input):
                start, end = match.span()

                if source_type == ImageSourceType.CLIPBOARD:
                    source = "clipboard"
                elif source_type == ImageSourceType.FILE:
                    source = match.group(1)  # The path
                elif source_type == ImageSourceType.URL:
                    source = match.group(1)  # The URL
                else:
                    continue

                ref = ImageReference(
                    source_type=source_type,
                    source=source,
                    position=start,
                    original_text=match.group(0)
                )
                matches.append((start, end, ref))

        # Sort by position so refs appear in input order regardless of which
        # pattern produced them.
        matches.sort(key=lambda x: x[0])

        # Extract text parts and image references
        text_parts: List[str] = []
        image_refs: List[ImageReference] = []
        last_end = 0

        for start, end, ref in matches:
            # Add text before this match
            text_before = raw_input[last_end:start]
            text_parts.append(text_before)
            image_refs.append(ref)
            last_end = end

        # Add remaining text after last match
        text_parts.append(raw_input[last_end:])

        return ParsedMultimodalInput(
            text_parts=text_parts,
            image_refs=image_refs,
            original_input=raw_input
        )

    def has_multimodal_markers(self, text: str) -> bool:
        """Quick check if text contains any multimodal markers.

        Args:
            text: Text to check

        Returns:
            True if text contains @paste, @image:, or @url:
        """
        for pattern, _ in self._patterns:
            if pattern.search(text):
                return True
        return False
|