kweaver-dolphin 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DolphinLanguageSDK/__init__.py +58 -0
- dolphin/__init__.py +62 -0
- dolphin/cli/__init__.py +20 -0
- dolphin/cli/args/__init__.py +9 -0
- dolphin/cli/args/parser.py +567 -0
- dolphin/cli/builtin_agents/__init__.py +22 -0
- dolphin/cli/commands/__init__.py +4 -0
- dolphin/cli/interrupt/__init__.py +8 -0
- dolphin/cli/interrupt/handler.py +205 -0
- dolphin/cli/interrupt/keyboard.py +82 -0
- dolphin/cli/main.py +49 -0
- dolphin/cli/multimodal/__init__.py +34 -0
- dolphin/cli/multimodal/clipboard.py +327 -0
- dolphin/cli/multimodal/handler.py +249 -0
- dolphin/cli/multimodal/image_processor.py +214 -0
- dolphin/cli/multimodal/input_parser.py +149 -0
- dolphin/cli/runner/__init__.py +8 -0
- dolphin/cli/runner/runner.py +989 -0
- dolphin/cli/ui/__init__.py +10 -0
- dolphin/cli/ui/console.py +2795 -0
- dolphin/cli/ui/input.py +340 -0
- dolphin/cli/ui/layout.py +425 -0
- dolphin/cli/ui/stream_renderer.py +302 -0
- dolphin/cli/utils/__init__.py +8 -0
- dolphin/cli/utils/helpers.py +135 -0
- dolphin/cli/utils/version.py +49 -0
- dolphin/core/__init__.py +107 -0
- dolphin/core/agent/__init__.py +10 -0
- dolphin/core/agent/agent_state.py +69 -0
- dolphin/core/agent/base_agent.py +970 -0
- dolphin/core/code_block/__init__.py +0 -0
- dolphin/core/code_block/agent_init_block.py +0 -0
- dolphin/core/code_block/assign_block.py +98 -0
- dolphin/core/code_block/basic_code_block.py +1865 -0
- dolphin/core/code_block/explore_block.py +1327 -0
- dolphin/core/code_block/explore_block_v2.py +712 -0
- dolphin/core/code_block/explore_strategy.py +672 -0
- dolphin/core/code_block/judge_block.py +220 -0
- dolphin/core/code_block/prompt_block.py +32 -0
- dolphin/core/code_block/skill_call_deduplicator.py +291 -0
- dolphin/core/code_block/tool_block.py +129 -0
- dolphin/core/common/__init__.py +17 -0
- dolphin/core/common/constants.py +176 -0
- dolphin/core/common/enums.py +1173 -0
- dolphin/core/common/exceptions.py +133 -0
- dolphin/core/common/multimodal.py +539 -0
- dolphin/core/common/object_type.py +165 -0
- dolphin/core/common/output_format.py +432 -0
- dolphin/core/common/types.py +36 -0
- dolphin/core/config/__init__.py +16 -0
- dolphin/core/config/global_config.py +1289 -0
- dolphin/core/config/ontology_config.py +133 -0
- dolphin/core/context/__init__.py +12 -0
- dolphin/core/context/context.py +1580 -0
- dolphin/core/context/context_manager.py +161 -0
- dolphin/core/context/var_output.py +82 -0
- dolphin/core/context/variable_pool.py +356 -0
- dolphin/core/context_engineer/__init__.py +41 -0
- dolphin/core/context_engineer/config/__init__.py +5 -0
- dolphin/core/context_engineer/config/settings.py +402 -0
- dolphin/core/context_engineer/core/__init__.py +7 -0
- dolphin/core/context_engineer/core/budget_manager.py +327 -0
- dolphin/core/context_engineer/core/context_assembler.py +583 -0
- dolphin/core/context_engineer/core/context_manager.py +637 -0
- dolphin/core/context_engineer/core/tokenizer_service.py +260 -0
- dolphin/core/context_engineer/example/incremental_example.py +267 -0
- dolphin/core/context_engineer/example/traditional_example.py +334 -0
- dolphin/core/context_engineer/services/__init__.py +5 -0
- dolphin/core/context_engineer/services/compressor.py +399 -0
- dolphin/core/context_engineer/utils/__init__.py +6 -0
- dolphin/core/context_engineer/utils/context_utils.py +441 -0
- dolphin/core/context_engineer/utils/message_formatter.py +270 -0
- dolphin/core/context_engineer/utils/token_utils.py +139 -0
- dolphin/core/coroutine/__init__.py +15 -0
- dolphin/core/coroutine/context_snapshot.py +154 -0
- dolphin/core/coroutine/context_snapshot_profile.py +922 -0
- dolphin/core/coroutine/context_snapshot_store.py +268 -0
- dolphin/core/coroutine/execution_frame.py +145 -0
- dolphin/core/coroutine/execution_state_registry.py +161 -0
- dolphin/core/coroutine/resume_handle.py +101 -0
- dolphin/core/coroutine/step_result.py +101 -0
- dolphin/core/executor/__init__.py +18 -0
- dolphin/core/executor/debug_controller.py +630 -0
- dolphin/core/executor/dolphin_executor.py +1063 -0
- dolphin/core/executor/executor.py +624 -0
- dolphin/core/flags/__init__.py +27 -0
- dolphin/core/flags/definitions.py +49 -0
- dolphin/core/flags/manager.py +113 -0
- dolphin/core/hook/__init__.py +95 -0
- dolphin/core/hook/expression_evaluator.py +499 -0
- dolphin/core/hook/hook_dispatcher.py +380 -0
- dolphin/core/hook/hook_types.py +248 -0
- dolphin/core/hook/isolated_variable_pool.py +284 -0
- dolphin/core/interfaces.py +53 -0
- dolphin/core/llm/__init__.py +0 -0
- dolphin/core/llm/llm.py +495 -0
- dolphin/core/llm/llm_call.py +100 -0
- dolphin/core/llm/llm_client.py +1285 -0
- dolphin/core/llm/message_sanitizer.py +120 -0
- dolphin/core/logging/__init__.py +20 -0
- dolphin/core/logging/logger.py +526 -0
- dolphin/core/message/__init__.py +8 -0
- dolphin/core/message/compressor.py +749 -0
- dolphin/core/parser/__init__.py +8 -0
- dolphin/core/parser/parser.py +405 -0
- dolphin/core/runtime/__init__.py +10 -0
- dolphin/core/runtime/runtime_graph.py +926 -0
- dolphin/core/runtime/runtime_instance.py +446 -0
- dolphin/core/skill/__init__.py +14 -0
- dolphin/core/skill/context_retention.py +157 -0
- dolphin/core/skill/skill_function.py +686 -0
- dolphin/core/skill/skill_matcher.py +282 -0
- dolphin/core/skill/skillkit.py +700 -0
- dolphin/core/skill/skillset.py +72 -0
- dolphin/core/trajectory/__init__.py +10 -0
- dolphin/core/trajectory/recorder.py +189 -0
- dolphin/core/trajectory/trajectory.py +522 -0
- dolphin/core/utils/__init__.py +9 -0
- dolphin/core/utils/cache_kv.py +212 -0
- dolphin/core/utils/tools.py +340 -0
- dolphin/lib/__init__.py +93 -0
- dolphin/lib/debug/__init__.py +8 -0
- dolphin/lib/debug/visualizer.py +409 -0
- dolphin/lib/memory/__init__.py +28 -0
- dolphin/lib/memory/async_processor.py +220 -0
- dolphin/lib/memory/llm_calls.py +195 -0
- dolphin/lib/memory/manager.py +78 -0
- dolphin/lib/memory/sandbox.py +46 -0
- dolphin/lib/memory/storage.py +245 -0
- dolphin/lib/memory/utils.py +51 -0
- dolphin/lib/ontology/__init__.py +12 -0
- dolphin/lib/ontology/basic/__init__.py +0 -0
- dolphin/lib/ontology/basic/base.py +102 -0
- dolphin/lib/ontology/basic/concept.py +130 -0
- dolphin/lib/ontology/basic/object.py +11 -0
- dolphin/lib/ontology/basic/relation.py +63 -0
- dolphin/lib/ontology/datasource/__init__.py +27 -0
- dolphin/lib/ontology/datasource/datasource.py +66 -0
- dolphin/lib/ontology/datasource/oracle_datasource.py +338 -0
- dolphin/lib/ontology/datasource/sql.py +845 -0
- dolphin/lib/ontology/mapping.py +177 -0
- dolphin/lib/ontology/ontology.py +733 -0
- dolphin/lib/ontology/ontology_context.py +16 -0
- dolphin/lib/ontology/ontology_manager.py +107 -0
- dolphin/lib/skill_results/__init__.py +31 -0
- dolphin/lib/skill_results/cache_backend.py +559 -0
- dolphin/lib/skill_results/result_processor.py +181 -0
- dolphin/lib/skill_results/result_reference.py +179 -0
- dolphin/lib/skill_results/skillkit_hook.py +324 -0
- dolphin/lib/skill_results/strategies.py +328 -0
- dolphin/lib/skill_results/strategy_registry.py +150 -0
- dolphin/lib/skillkits/__init__.py +44 -0
- dolphin/lib/skillkits/agent_skillkit.py +155 -0
- dolphin/lib/skillkits/cognitive_skillkit.py +82 -0
- dolphin/lib/skillkits/env_skillkit.py +250 -0
- dolphin/lib/skillkits/mcp_adapter.py +616 -0
- dolphin/lib/skillkits/mcp_skillkit.py +771 -0
- dolphin/lib/skillkits/memory_skillkit.py +650 -0
- dolphin/lib/skillkits/noop_skillkit.py +31 -0
- dolphin/lib/skillkits/ontology_skillkit.py +89 -0
- dolphin/lib/skillkits/plan_act_skillkit.py +452 -0
- dolphin/lib/skillkits/resource/__init__.py +52 -0
- dolphin/lib/skillkits/resource/models/__init__.py +6 -0
- dolphin/lib/skillkits/resource/models/skill_config.py +109 -0
- dolphin/lib/skillkits/resource/models/skill_meta.py +127 -0
- dolphin/lib/skillkits/resource/resource_skillkit.py +393 -0
- dolphin/lib/skillkits/resource/skill_cache.py +215 -0
- dolphin/lib/skillkits/resource/skill_loader.py +395 -0
- dolphin/lib/skillkits/resource/skill_validator.py +406 -0
- dolphin/lib/skillkits/resource_skillkit.py +11 -0
- dolphin/lib/skillkits/search_skillkit.py +163 -0
- dolphin/lib/skillkits/sql_skillkit.py +274 -0
- dolphin/lib/skillkits/system_skillkit.py +509 -0
- dolphin/lib/skillkits/vm_skillkit.py +65 -0
- dolphin/lib/utils/__init__.py +9 -0
- dolphin/lib/utils/data_process.py +207 -0
- dolphin/lib/utils/handle_progress.py +178 -0
- dolphin/lib/utils/security.py +139 -0
- dolphin/lib/utils/text_retrieval.py +462 -0
- dolphin/lib/vm/__init__.py +11 -0
- dolphin/lib/vm/env_executor.py +895 -0
- dolphin/lib/vm/python_session_manager.py +453 -0
- dolphin/lib/vm/vm.py +610 -0
- dolphin/sdk/__init__.py +60 -0
- dolphin/sdk/agent/__init__.py +12 -0
- dolphin/sdk/agent/agent_factory.py +236 -0
- dolphin/sdk/agent/dolphin_agent.py +1106 -0
- dolphin/sdk/api/__init__.py +4 -0
- dolphin/sdk/runtime/__init__.py +8 -0
- dolphin/sdk/runtime/env.py +363 -0
- dolphin/sdk/skill/__init__.py +10 -0
- dolphin/sdk/skill/global_skills.py +706 -0
- dolphin/sdk/skill/traditional_toolkit.py +260 -0
- kweaver_dolphin-0.1.0.dist-info/METADATA +521 -0
- kweaver_dolphin-0.1.0.dist-info/RECORD +199 -0
- kweaver_dolphin-0.1.0.dist-info/WHEEL +5 -0
- kweaver_dolphin-0.1.0.dist-info/entry_points.txt +27 -0
- kweaver_dolphin-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
- kweaver_dolphin-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
"""Context utility functions."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Dict, Optional
|
|
5
|
+
from ..core.tokenizer_service import TokenizerService
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def extract_key_info(
    text: str, max_sentences: int = 3, keyword_weight: float = 2.0
) -> str:
    """
    Extract key information from text using simple heuristics.

    Sentences are scored by the repeated, non-trivial words they contain plus
    a small bonus for the first and last sentence. The best-scoring sentences
    are returned in their original order.

    Args:
        text: Input text
        max_sentences: Maximum number of sentences to extract. A value of 0
            or less yields an empty string.
        keyword_weight: Weight for keyword-containing sentences

    Returns:
        The selected sentences joined with ". " and terminated by ".", or an
        empty string when there is nothing to extract. The original terminal
        punctuation ("!", "?") is not preserved.
    """
    # A non-positive budget used to produce "." (for 0) or an arbitrary
    # slice (for negatives); treat it as "extract nothing" instead.
    if not text or max_sentences <= 0:
        return ""

    # Split into sentences
    sentences = [part.strip() for part in re.split(r"[.!?]+", text) if part.strip()]
    if not sentences:
        return ""

    # Simple keyword extraction: count words longer than three characters
    word_freq = {}
    for word in re.findall(r"\b\w+\b", text.lower()):
        if len(word) > 3:  # Filter short words
            word_freq[word] = word_freq.get(word, 0) + 1

    # Get top keywords (excluding common words)
    common_words = {
        "the",
        "and",
        "or",
        "but",
        "in",
        "on",
        "at",
        "to",
        "for",
        "of",
        "with",
        "by",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "could",
        "should",
        "may",
        "might",
        "can",
        "this",
        "that",
        "these",
        "those",
        "a",
        "an",
    }

    # Only words that occur more than once count as keywords
    keywords = {
        word: freq
        for word, freq in word_freq.items()
        if word not in common_words and freq > 1
    }

    # Score sentences based on keywords and position
    last_index = len(sentences) - 1
    scores = []
    for index, sentence in enumerate(sentences):
        sentence_lower = sentence.lower()
        score = 0

        # Keyword matches (substring match, weighted by keyword frequency)
        for keyword, freq in keywords.items():
            if keyword in sentence_lower:
                score += keyword_weight * freq

        # Position bonus (first and last sentences get higher scores)
        if index == 0 or index == last_index:
            score += 1.0

        scores.append(score)

    # Rank by score; the sort is stable, so ties keep document order
    ranked = sorted(range(len(sentences)), key=scores.__getitem__, reverse=True)

    # Keep the best ones and restore original order
    chosen = sorted(ranked[:max_sentences])

    return ". ".join(sentences[index] for index in chosen) + "."
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def summarize_content(
    text: str, target_ratio: float = 0.3, preserve_keywords: bool = True
) -> str:
    """
    Summarize content by extracting key sentences and compressing.

    Args:
        text: Input text
        target_ratio: Target compression ratio. It is only used to derive the
            sentence budget ``int(1 / target_ratio)``; the budget is clamped
            to at least one sentence, so ratios <= 0 or > 1 no longer fail or
            produce an empty summary.
        preserve_keywords: Whether to preserve important keywords

    Returns:
        Summarized content
    """
    if not text:
        return ""

    # Derive the sentence budget. The original expression divided by zero for
    # target_ratio == 0 and produced a zero/negative budget for ratios > 1 or
    # < 0, so clamp to a minimum of one sentence.
    if target_ratio > 0:
        max_sentences = max(1, int(1 / target_ratio))
    else:
        max_sentences = 1

    # Extract key information first
    key_info = extract_key_info(text, max_sentences=max_sentences)

    if not preserve_keywords:
        return key_info

    # Extract keywords from original text (words longer than four characters)
    word_freq = {}
    for word in re.findall(r"\b\w+\b", text.lower()):
        if len(word) > 4:  # Filter short words
            word_freq[word] = word_freq.get(word, 0) + 1

    # Get top keywords: the ten most frequent, keeping only repeated ones
    top_words = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:10]
    keyword_list = [word for word, freq in top_words if freq > 1]

    # Split once; the sentence list does not depend on the keyword
    sentences = re.split(r"[.!?]+", text)

    # Ensure keywords are included in summary
    summary_parts = [key_info]
    # Lower-cased text already present in the summary, so a sentence added for
    # one keyword is not appended again for another keyword it also contains.
    covered = key_info.lower()

    # Add keyword context if not already included
    for keyword in keyword_list[:5]:  # Top 5 keywords
        if keyword in covered:
            continue
        # Find the first sentence containing this keyword
        for sentence in sentences:
            if keyword in sentence.lower():
                snippet = sentence.strip()
                summary_parts.append(snippet)
                covered = covered + " " + snippet.lower()
                break

    return " ".join(summary_parts)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def extract_entities(text: str) -> Dict[str, List[str]]:
    """
    Simple entity extraction using regex patterns.

    Args:
        text: Input text

    Returns:
        Dictionary of extracted entities by type, with the keys "dates",
        "emails", "urls", "numbers" and "cap_words". Each list contains unique
        matches in order of first appearance (dates are grouped by pattern).
    """
    entities = {"dates": [], "emails": [], "urls": [], "numbers": [], "cap_words": []}

    if not text:
        return entities

    # Date patterns
    date_patterns = [
        r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",  # MM/DD/YYYY or DD-MM-YYYY
        r"\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b",  # YYYY/MM/DD
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b",  # Month DD, YYYY
    ]

    for pattern in date_patterns:
        entities["dates"].extend(re.findall(pattern, text, re.IGNORECASE))

    # Email patterns. The TLD class used to be "[A-Z|a-z]", which also
    # accepted a literal "|" (e.g. "a@b.com|x" matched as a whole).
    entities["emails"] = re.findall(
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
    )

    # URL patterns
    # NOTE(review): this character set is permissive and will also capture
    # trailing punctuation such as "." or "," - kept unchanged.
    entities["urls"] = re.findall(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        text,
    )

    # Number patterns (integers and simple decimals)
    entities["numbers"] = re.findall(r"\b\d+(?:\.\d+)?\b", text)

    # Capitalized words (potential proper nouns), including runs of them
    entities["cap_words"] = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)

    # Remove duplicates while keeping first-seen order, so the result is
    # deterministic (set() ordering is not).
    return {key: list(dict.fromkeys(values)) for key, values in entities.items()}
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def calculate_relevance_score(
    query: str, content: str, tokenizer_service: Optional["TokenizerService"] = None
) -> float:
    """
    Calculate relevance score between query and content.

    The score is the Jaccard similarity of the two keyword sets (lower-cased
    words minus a stop-word list) plus 0.2 for every double-quoted phrase of
    the query that occurs in the content, capped at 1.0.

    Args:
        query: Search query or task description
        content: Content to score
        tokenizer_service: TokenizerService instance. Currently not used by
            the computation.

    Returns:
        Relevance score (0-1)
    """
    if not query or not content:
        return 0.0

    # Lower-case once; reused for tokenising and for the phrase check below
    query_lower = query.lower()
    content_lower = content.lower()

    # Simple keyword-based relevance
    query_words = set(re.findall(r"\b\w+\b", query_lower))
    content_words = set(re.findall(r"\b\w+\b", content_lower))

    # Remove common words
    common_words = {
        "the",
        "and",
        "or",
        "but",
        "in",
        "on",
        "at",
        "to",
        "for",
        "of",
        "with",
        "by",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "could",
        "should",
        "may",
        "might",
        "can",
        "this",
        "that",
        "these",
        "those",
        "a",
        "an",
        "it",
        "its",
        "they",
        "them",
        "their",
        "we",
        "us",
        "our",
        "you",
        "your",
    }

    query_keywords = query_words - common_words
    content_keywords = content_words - common_words

    # A query made only of stop words carries no signal
    if not query_keywords:
        return 0.0

    # Calculate overlap
    overlap = len(query_keywords & content_keywords)
    total_unique = len(query_keywords | content_keywords)

    # Jaccard similarity
    jaccard = overlap / total_unique if total_unique > 0 else 0.0

    # Bonus for exact phrase matches (phrases are the "double-quoted" parts
    # of the query; matching is case-insensitive)
    phrase_bonus = 0.0
    for phrase in re.findall(r'"([^"]+)"', query):
        if phrase.lower() in content_lower:
            phrase_bonus += 0.2

    return min(1.0, jaccard + phrase_bonus)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def extract_task_keywords(task_description: str) -> List[str]:
    """
    Extract keywords from task description.

    A word is a keyword when it appears (case-insensitively) in the built-in
    lists of action verbs or technical terms.

    Args:
        task_description: Task description text

    Returns:
        List of extracted keywords: unique, in order of first appearance,
        at most 10 entries
    """
    if not task_description:
        return []

    # Extract action verbs and important nouns
    words = re.findall(r"\b\w+\b", task_description.lower())

    # Common action verbs in tasks
    action_verbs = {
        "analyze",
        "create",
        "generate",
        "find",
        "search",
        "extract",
        "summarize",
        "compare",
        "evaluate",
        "assess",
        "review",
        "examine",
        "investigate",
        "determine",
        "identify",
        "classify",
        "organize",
        "optimize",
        "improve",
        "fix",
        "solve",
        "calculate",
        "compute",
        "predict",
        "recommend",
        "suggest",
    }

    # Technical terms (expand as needed)
    tech_terms = {
        "data",
        "algorithm",
        "model",
        "function",
        "variable",
        "database",
        "api",
        "json",
        "xml",
        "html",
        "css",
        "javascript",
        "python",
        "code",
        "program",
        "software",
        "system",
        "architecture",
        "design",
        "implementation",
        "testing",
        "debugging",
    }

    # Membership in the curated vocabularies is the only filter. The former
    # extra "len(word) > 3" check made the listed three-letter terms
    # ("fix", "api", "xml", "css") impossible to return.
    vocabulary = action_verbs | tech_terms
    keywords = [word for word in words if word in vocabulary]

    # Remove duplicates while preserving order
    unique_keywords = list(dict.fromkeys(keywords))

    return unique_keywords[:10]  # Return the first 10 keywords
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def format_context_section(
    name: str, content: str, max_length: Optional[int] = None
) -> str:
    """
    Format a context section with proper headers and structure.

    Args:
        name: Section name (upper-cased in the header)
        content: Section content
        max_length: Maximum length for content (will truncate if exceeded).
            ``None``, 0 and negative values mean "no limit". The limit applies
            to the content only, not to the header.

    Returns:
        Formatted section ``"### NAME ###\\n<content>"``, or an empty string
        when content is empty
    """
    if not content:
        return ""

    # Truncate if necessary
    if max_length and max_length > 0 and len(content) > max_length:
        ellipsis = "..."
        if max_length >= len(ellipsis):
            content = content[: max_length - len(ellipsis)] + ellipsis
        else:
            # Too short to fit an ellipsis: a plain cut keeps the result
            # within max_length (previously the negative slice index made
            # the output longer than the limit).
            content = content[:max_length]

    # Format with header
    return f"### {name.upper()} ###\n{content}"
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def merge_context_sections(sections: Dict[str, str], separator: str = "\n\n") -> str:
    """
    Merge multiple context sections into a single string.

    Sections whose content is empty or whitespace-only are skipped; the rest
    are stripped, formatted with ``format_context_section`` and joined in
    dictionary order.

    Args:
        sections: Dictionary of section names to content
        separator: Separator between sections

    Returns:
        Merged context string
    """
    if not sections:
        return ""

    return separator.join(
        format_context_section(section_name, body.strip())
        for section_name, body in sections.items()
        if body and body.strip()
    )
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
"""
|
|
3
|
+
Message formatter for converting AssembledContext to various LLM message formats.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List, Dict, Any, Optional
|
|
7
|
+
|
|
8
|
+
from dolphin.core.common.enums import Messages, MessageRole
|
|
9
|
+
from ..core.context_assembler import AssembledContext
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MessageFormatter:
    """Converts AssembledContext to different LLM message formats."""

    # Placement buckets are always emitted in this order: head -> middle -> tail
    _PLACEMENT_ORDER = ("head", "middle", "tail")

    def __init__(self):
        """Initialize message formatter."""
        # Default section to role mapping
        self.default_role_mapping = {
            "system": "system",
            "user": "user",
            "assistant": "assistant",
            "task": "user",  # Task descriptions are usually provided as user input.
            "tools": "system",  # Tool information as system context
            "history": "user",  # Historical conversation as user input
            "memory": "system",  # Memory information as system context
            "rag": "system",  # RAG information as system context
            "fewshot": "assistant",  # few-shot examples as assistant responses
            "scratchpad": "assistant",  # Thought process as internal state of the assistant
        }

    @classmethod
    def _placed_sections(cls, assembled_context: AssembledContext) -> List[tuple]:
        """Return ``(placement, section_name, stripped_content)`` in output order.

        Walks the placement map in head -> middle -> tail order and resolves
        every listed name to the first section carrying that name. Names
        without a matching section and sections with blank content are
        skipped.
        """
        # Index sections once instead of rescanning the list for every name;
        # setdefault keeps the first section when names repeat.
        first_by_name = {}
        for section in assembled_context.sections:
            first_by_name.setdefault(section.name, section)

        placed = []
        for placement in cls._PLACEMENT_ORDER:
            if placement not in assembled_context.placement_map:
                continue
            for section_name in assembled_context.placement_map[placement]:
                section = first_by_name.get(section_name)
                if not section:
                    continue
                content = section.content.strip()
                if not content:
                    continue
                placed.append((placement, section_name, content))
        return placed

    def to_openai_messages(
        self,
        assembled_context: AssembledContext,
        role_mapping: Optional[Dict[str, str]] = None,
        include_placement: bool = False,
    ) -> List[Dict[str, str]]:
        """Convert AssembledContext to OpenAI message format.

        Args:
            assembled_context: assembled context
            role_mapping: custom mapping from section to role; when omitted or
                empty, the default mapping is used. Sections missing from the
                mapping get the "system" role.
            include_placement: whether to include placement information in the
                message (content is prefixed with "[head] ", "[middle] " or
                "[tail] ")

        Returns:
            list of messages in OpenAI format
        """
        if not assembled_context.sections:
            return []

        # Use custom mapping or default mapping
        mapping = role_mapping or self.default_role_mapping

        messages = []
        for placement, section_name, content in self._placed_sections(
            assembled_context
        ):
            # Determine role; default uses system role
            role = mapping.get(section_name, "system")

            # Build message content
            if include_placement:
                content = f"[{placement}] {content}"

            messages.append({"role": role, "content": content})

        return messages

    def to_openai_messages_simple(
        self,
        assembled_context: AssembledContext,
        user_sections: Optional[List[str]] = None,
    ) -> List[Dict[str, str]]:
        """Minimal OpenAI message format: a single system message + other messages.

        Strategy:
        - Decide the role of each section based on its message_role
        - Merge all system sections into a single system message placed first
        - Emit user, assistant and tool sections as individual messages in
          section order
        - Skip sections with any other message_role

        Args:
            assembled_context: Assembled context
            user_sections: Accepted for backward compatibility; roles are
                taken from each section's message_role and this argument is
                currently not consulted.

        Returns:
            Simplified message list [system, user] or similar structure
        """
        if not assembled_context.sections:
            return []

        # message_role -> OpenAI role string for sections emitted one-to-one.
        # The tool entry used to be compared against MessageRole.ASSISTANT a
        # second time, so tool sections were silently dropped.
        direct_roles = (
            (MessageRole.USER, "user"),
            (MessageRole.ASSISTANT, "assistant"),
            (MessageRole.TOOL, "tool"),
        )

        messages = []
        system_parts = []

        for section in assembled_context.sections:
            if section.message_role == MessageRole.SYSTEM:
                system_parts.append(section.content)
                continue
            for message_role, role_name in direct_roles:
                if section.message_role == message_role:
                    messages.append({"role": role_name, "content": section.content})
                    break

        # Build final message
        if system_parts:
            # Merge all system messages into one
            combined_system = "\n\n".join(system_parts)
            messages.insert(0, {"role": "system", "content": combined_system})
        return messages

    def to_dph_messages_simple(
        self,
        assembled_context: Optional[AssembledContext],
        user_sections: Optional[List[str]] = None,
    ) -> Messages:
        """The most simplified DolphinLanguage message format: a single system message + user message.

        Strategy:
        - Decide the role of each section based on its message_role
        - Section content that is already a Messages object is merged as-is,
          whatever the section's role
        - Plain system content is collected and inserted as one system
          message; plain user/assistant/tool content is added as one message
          each
        - Skip sections with any other message_role

        Args:
            assembled_context: The assembled context; None yields an empty
                Messages object
            user_sections: Accepted for backward compatibility; roles are
                taken from each section's message_role and this argument is
                currently not consulted.

        Returns:
            A simplified message list [system, user] or similar structure
        """
        if not assembled_context or not assembled_context.sections:
            return Messages()

        known_roles = (
            MessageRole.SYSTEM,
            MessageRole.USER,
            MessageRole.ASSISTANT,
            MessageRole.TOOL,
        )

        messages = Messages()
        system_parts = []

        for section in assembled_context.sections:
            role = next(
                (known for known in known_roles if section.message_role == known),
                None,
            )
            if role is None:
                continue

            if isinstance(section.content, Messages):
                # If it is a Messages type, merge the messages directly.
                messages.extend_messages(section.content)
            elif role == MessageRole.SYSTEM:
                system_parts.append(section.content)
            else:
                messages.add_message(role=role, content=section.content)

        # Build final message
        if system_parts:
            # Merge all system messages into one
            combined_system = "\n\n".join(system_parts)
            messages.insert_message(role=MessageRole.SYSTEM, content=combined_system)

        return messages

    def to_anthropic_messages(
        self,
        assembled_context: AssembledContext,
        role_mapping: Optional[Dict[str, str]] = None,
    ) -> List[Dict[str, Any]]:
        """Convert to Anthropic Claude message format.

        Args:
            assembled_context: Assembled context
            role_mapping: Custom mapping from section to role; when omitted or
                empty, "system" and "user" map to "user" and "assistant" maps
                to "assistant". Sections missing from the mapping get the
                "user" role.

        Returns:
            List of messages in Anthropic format
        """
        if not assembled_context.sections:
            return []

        # Anthropic mainly uses user and assistant, merging system content into user.
        mapping = role_mapping or {
            "system": "user",
            "user": "user",
            "assistant": "assistant",
        }

        # Process in placement order
        return [
            {"role": mapping.get(section_name, "user"), "content": content}
            for _placement, section_name, content in self._placed_sections(
                assembled_context
            )
        ]

    def create_custom_mapping(self, section_roles: Dict[str, str]) -> Dict[str, str]:
        """Create a custom section-to-role mapping.

        Args:
            section_roles: User-defined section-to-role mapping

        Returns:
            Complete mapping dictionary: a copy of the default mapping with
            the user-defined entries applied on top
        """
        # Allow users to customize based on the default mapping
        custom_mapping = self.default_role_mapping.copy()
        custom_mapping.update(section_roles)
        return custom_mapping
|