kweaver-dolphin 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DolphinLanguageSDK/__init__.py +58 -0
- dolphin/__init__.py +62 -0
- dolphin/cli/__init__.py +20 -0
- dolphin/cli/args/__init__.py +9 -0
- dolphin/cli/args/parser.py +567 -0
- dolphin/cli/builtin_agents/__init__.py +22 -0
- dolphin/cli/commands/__init__.py +4 -0
- dolphin/cli/interrupt/__init__.py +8 -0
- dolphin/cli/interrupt/handler.py +205 -0
- dolphin/cli/interrupt/keyboard.py +82 -0
- dolphin/cli/main.py +49 -0
- dolphin/cli/multimodal/__init__.py +34 -0
- dolphin/cli/multimodal/clipboard.py +327 -0
- dolphin/cli/multimodal/handler.py +249 -0
- dolphin/cli/multimodal/image_processor.py +214 -0
- dolphin/cli/multimodal/input_parser.py +149 -0
- dolphin/cli/runner/__init__.py +8 -0
- dolphin/cli/runner/runner.py +989 -0
- dolphin/cli/ui/__init__.py +10 -0
- dolphin/cli/ui/console.py +2795 -0
- dolphin/cli/ui/input.py +340 -0
- dolphin/cli/ui/layout.py +425 -0
- dolphin/cli/ui/stream_renderer.py +302 -0
- dolphin/cli/utils/__init__.py +8 -0
- dolphin/cli/utils/helpers.py +135 -0
- dolphin/cli/utils/version.py +49 -0
- dolphin/core/__init__.py +107 -0
- dolphin/core/agent/__init__.py +10 -0
- dolphin/core/agent/agent_state.py +69 -0
- dolphin/core/agent/base_agent.py +970 -0
- dolphin/core/code_block/__init__.py +0 -0
- dolphin/core/code_block/agent_init_block.py +0 -0
- dolphin/core/code_block/assign_block.py +98 -0
- dolphin/core/code_block/basic_code_block.py +1865 -0
- dolphin/core/code_block/explore_block.py +1327 -0
- dolphin/core/code_block/explore_block_v2.py +712 -0
- dolphin/core/code_block/explore_strategy.py +672 -0
- dolphin/core/code_block/judge_block.py +220 -0
- dolphin/core/code_block/prompt_block.py +32 -0
- dolphin/core/code_block/skill_call_deduplicator.py +291 -0
- dolphin/core/code_block/tool_block.py +129 -0
- dolphin/core/common/__init__.py +17 -0
- dolphin/core/common/constants.py +176 -0
- dolphin/core/common/enums.py +1173 -0
- dolphin/core/common/exceptions.py +133 -0
- dolphin/core/common/multimodal.py +539 -0
- dolphin/core/common/object_type.py +165 -0
- dolphin/core/common/output_format.py +432 -0
- dolphin/core/common/types.py +36 -0
- dolphin/core/config/__init__.py +16 -0
- dolphin/core/config/global_config.py +1289 -0
- dolphin/core/config/ontology_config.py +133 -0
- dolphin/core/context/__init__.py +12 -0
- dolphin/core/context/context.py +1580 -0
- dolphin/core/context/context_manager.py +161 -0
- dolphin/core/context/var_output.py +82 -0
- dolphin/core/context/variable_pool.py +356 -0
- dolphin/core/context_engineer/__init__.py +41 -0
- dolphin/core/context_engineer/config/__init__.py +5 -0
- dolphin/core/context_engineer/config/settings.py +402 -0
- dolphin/core/context_engineer/core/__init__.py +7 -0
- dolphin/core/context_engineer/core/budget_manager.py +327 -0
- dolphin/core/context_engineer/core/context_assembler.py +583 -0
- dolphin/core/context_engineer/core/context_manager.py +637 -0
- dolphin/core/context_engineer/core/tokenizer_service.py +260 -0
- dolphin/core/context_engineer/example/incremental_example.py +267 -0
- dolphin/core/context_engineer/example/traditional_example.py +334 -0
- dolphin/core/context_engineer/services/__init__.py +5 -0
- dolphin/core/context_engineer/services/compressor.py +399 -0
- dolphin/core/context_engineer/utils/__init__.py +6 -0
- dolphin/core/context_engineer/utils/context_utils.py +441 -0
- dolphin/core/context_engineer/utils/message_formatter.py +270 -0
- dolphin/core/context_engineer/utils/token_utils.py +139 -0
- dolphin/core/coroutine/__init__.py +15 -0
- dolphin/core/coroutine/context_snapshot.py +154 -0
- dolphin/core/coroutine/context_snapshot_profile.py +922 -0
- dolphin/core/coroutine/context_snapshot_store.py +268 -0
- dolphin/core/coroutine/execution_frame.py +145 -0
- dolphin/core/coroutine/execution_state_registry.py +161 -0
- dolphin/core/coroutine/resume_handle.py +101 -0
- dolphin/core/coroutine/step_result.py +101 -0
- dolphin/core/executor/__init__.py +18 -0
- dolphin/core/executor/debug_controller.py +630 -0
- dolphin/core/executor/dolphin_executor.py +1063 -0
- dolphin/core/executor/executor.py +624 -0
- dolphin/core/flags/__init__.py +27 -0
- dolphin/core/flags/definitions.py +49 -0
- dolphin/core/flags/manager.py +113 -0
- dolphin/core/hook/__init__.py +95 -0
- dolphin/core/hook/expression_evaluator.py +499 -0
- dolphin/core/hook/hook_dispatcher.py +380 -0
- dolphin/core/hook/hook_types.py +248 -0
- dolphin/core/hook/isolated_variable_pool.py +284 -0
- dolphin/core/interfaces.py +53 -0
- dolphin/core/llm/__init__.py +0 -0
- dolphin/core/llm/llm.py +495 -0
- dolphin/core/llm/llm_call.py +100 -0
- dolphin/core/llm/llm_client.py +1285 -0
- dolphin/core/llm/message_sanitizer.py +120 -0
- dolphin/core/logging/__init__.py +20 -0
- dolphin/core/logging/logger.py +526 -0
- dolphin/core/message/__init__.py +8 -0
- dolphin/core/message/compressor.py +749 -0
- dolphin/core/parser/__init__.py +8 -0
- dolphin/core/parser/parser.py +405 -0
- dolphin/core/runtime/__init__.py +10 -0
- dolphin/core/runtime/runtime_graph.py +926 -0
- dolphin/core/runtime/runtime_instance.py +446 -0
- dolphin/core/skill/__init__.py +14 -0
- dolphin/core/skill/context_retention.py +157 -0
- dolphin/core/skill/skill_function.py +686 -0
- dolphin/core/skill/skill_matcher.py +282 -0
- dolphin/core/skill/skillkit.py +700 -0
- dolphin/core/skill/skillset.py +72 -0
- dolphin/core/trajectory/__init__.py +10 -0
- dolphin/core/trajectory/recorder.py +189 -0
- dolphin/core/trajectory/trajectory.py +522 -0
- dolphin/core/utils/__init__.py +9 -0
- dolphin/core/utils/cache_kv.py +212 -0
- dolphin/core/utils/tools.py +340 -0
- dolphin/lib/__init__.py +93 -0
- dolphin/lib/debug/__init__.py +8 -0
- dolphin/lib/debug/visualizer.py +409 -0
- dolphin/lib/memory/__init__.py +28 -0
- dolphin/lib/memory/async_processor.py +220 -0
- dolphin/lib/memory/llm_calls.py +195 -0
- dolphin/lib/memory/manager.py +78 -0
- dolphin/lib/memory/sandbox.py +46 -0
- dolphin/lib/memory/storage.py +245 -0
- dolphin/lib/memory/utils.py +51 -0
- dolphin/lib/ontology/__init__.py +12 -0
- dolphin/lib/ontology/basic/__init__.py +0 -0
- dolphin/lib/ontology/basic/base.py +102 -0
- dolphin/lib/ontology/basic/concept.py +130 -0
- dolphin/lib/ontology/basic/object.py +11 -0
- dolphin/lib/ontology/basic/relation.py +63 -0
- dolphin/lib/ontology/datasource/__init__.py +27 -0
- dolphin/lib/ontology/datasource/datasource.py +66 -0
- dolphin/lib/ontology/datasource/oracle_datasource.py +338 -0
- dolphin/lib/ontology/datasource/sql.py +845 -0
- dolphin/lib/ontology/mapping.py +177 -0
- dolphin/lib/ontology/ontology.py +733 -0
- dolphin/lib/ontology/ontology_context.py +16 -0
- dolphin/lib/ontology/ontology_manager.py +107 -0
- dolphin/lib/skill_results/__init__.py +31 -0
- dolphin/lib/skill_results/cache_backend.py +559 -0
- dolphin/lib/skill_results/result_processor.py +181 -0
- dolphin/lib/skill_results/result_reference.py +179 -0
- dolphin/lib/skill_results/skillkit_hook.py +324 -0
- dolphin/lib/skill_results/strategies.py +328 -0
- dolphin/lib/skill_results/strategy_registry.py +150 -0
- dolphin/lib/skillkits/__init__.py +44 -0
- dolphin/lib/skillkits/agent_skillkit.py +155 -0
- dolphin/lib/skillkits/cognitive_skillkit.py +82 -0
- dolphin/lib/skillkits/env_skillkit.py +250 -0
- dolphin/lib/skillkits/mcp_adapter.py +616 -0
- dolphin/lib/skillkits/mcp_skillkit.py +771 -0
- dolphin/lib/skillkits/memory_skillkit.py +650 -0
- dolphin/lib/skillkits/noop_skillkit.py +31 -0
- dolphin/lib/skillkits/ontology_skillkit.py +89 -0
- dolphin/lib/skillkits/plan_act_skillkit.py +452 -0
- dolphin/lib/skillkits/resource/__init__.py +52 -0
- dolphin/lib/skillkits/resource/models/__init__.py +6 -0
- dolphin/lib/skillkits/resource/models/skill_config.py +109 -0
- dolphin/lib/skillkits/resource/models/skill_meta.py +127 -0
- dolphin/lib/skillkits/resource/resource_skillkit.py +393 -0
- dolphin/lib/skillkits/resource/skill_cache.py +215 -0
- dolphin/lib/skillkits/resource/skill_loader.py +395 -0
- dolphin/lib/skillkits/resource/skill_validator.py +406 -0
- dolphin/lib/skillkits/resource_skillkit.py +11 -0
- dolphin/lib/skillkits/search_skillkit.py +163 -0
- dolphin/lib/skillkits/sql_skillkit.py +274 -0
- dolphin/lib/skillkits/system_skillkit.py +509 -0
- dolphin/lib/skillkits/vm_skillkit.py +65 -0
- dolphin/lib/utils/__init__.py +9 -0
- dolphin/lib/utils/data_process.py +207 -0
- dolphin/lib/utils/handle_progress.py +178 -0
- dolphin/lib/utils/security.py +139 -0
- dolphin/lib/utils/text_retrieval.py +462 -0
- dolphin/lib/vm/__init__.py +11 -0
- dolphin/lib/vm/env_executor.py +895 -0
- dolphin/lib/vm/python_session_manager.py +453 -0
- dolphin/lib/vm/vm.py +610 -0
- dolphin/sdk/__init__.py +60 -0
- dolphin/sdk/agent/__init__.py +12 -0
- dolphin/sdk/agent/agent_factory.py +236 -0
- dolphin/sdk/agent/dolphin_agent.py +1106 -0
- dolphin/sdk/api/__init__.py +4 -0
- dolphin/sdk/runtime/__init__.py +8 -0
- dolphin/sdk/runtime/env.py +363 -0
- dolphin/sdk/skill/__init__.py +10 -0
- dolphin/sdk/skill/global_skills.py +706 -0
- dolphin/sdk/skill/traditional_toolkit.py +260 -0
- kweaver_dolphin-0.1.0.dist-info/METADATA +521 -0
- kweaver_dolphin-0.1.0.dist-info/RECORD +199 -0
- kweaver_dolphin-0.1.0.dist-info/WHEEL +5 -0
- kweaver_dolphin-0.1.0.dist-info/entry_points.txt +27 -0
- kweaver_dolphin-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
- kweaver_dolphin-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""Compressor service for content optimization and compression."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Dict, List, Optional, Any, Union
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
|
|
8
|
+
from dolphin.core.common.enums import Messages
|
|
9
|
+
from ..core.tokenizer_service import TokenizerService
|
|
10
|
+
from ..utils.context_utils import extract_key_info, summarize_content
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class CompressionResult:
    """Result of compression operation."""

    # Text returned by the compressor (the untouched input when nothing was removed).
    compressed_content: str
    # Token count of the input before compression.
    original_tokens: int
    # Token count reported for ``compressed_content``.
    compressed_tokens: int
    # compressed_tokens / original_tokens; the compressors in this module
    # report 1.0 when the content already fits.
    compression_ratio: float
    # Label of the strategy that produced this result, e.g. "truncate".
    method_used: str
    # Strategy-specific details (free-form key/value pairs).
    metadata: Dict[str, Any]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BaseCompressor(ABC):
    """Abstract base class for compressors."""

    @abstractmethod
    def compress(self, content: str, target_tokens: int, **kwargs) -> CompressionResult:
        """Reduce ``content`` toward a budget of ``target_tokens`` tokens.

        Args:
            content: Text to compress.
            target_tokens: Token budget the result should aim for.
            **kwargs: Strategy-specific options.

        Returns:
            A ``CompressionResult`` describing the outcome.
        """

    @abstractmethod
    def get_name(self) -> str:
        """Return the identifier of this compression strategy."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TruncateCompressor(BaseCompressor):
    """Simple truncation-based compressor."""

    def __init__(self, tokenizer_service: Optional[TokenizerService] = None):
        # Use the supplied tokenizer, or build a default one when none is given.
        self.tokenizer = tokenizer_service or TokenizerService()

    def compress(self, content: str, target_tokens: int, **kwargs) -> CompressionResult:
        """Shorten ``content`` by delegating to ``truncate_to_tokens``.

        Args:
            content: Text to shorten.
            target_tokens: Token budget handed to ``truncate_to_tokens``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``CompressionResult`` with ``method_used="no_compression"`` when
            the content already fits, otherwise ``method_used="truncate"``.
        """
        token_count = self.tokenizer.count_tokens(content)

        if token_count > target_tokens:
            # Deferred import: only needed on the truncation path.
            from ..utils.token_utils import truncate_to_tokens

            shortened = truncate_to_tokens(content, target_tokens, self.tokenizer)
            shortened_count = self.tokenizer.count_tokens(shortened)
            return CompressionResult(
                compressed_content=shortened,
                original_tokens=token_count,
                compressed_tokens=shortened_count,
                compression_ratio=shortened_count / token_count,
                method_used="truncate",
                metadata={"strategy": "simple_truncate"},
            )

        # Nothing to do: the input is already within the budget.
        return CompressionResult(
            compressed_content=content,
            original_tokens=token_count,
            compressed_tokens=token_count,
            compression_ratio=1.0,
            method_used="no_compression",
            metadata={"reason": "content_already_fits"},
        )

    def get_name(self) -> str:
        """Return the strategy identifier."""
        return "truncate"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ExtractiveCompressor(BaseCompressor):
    """Extractive compressor that keeps key sentences."""

    # Rough estimate of tokens per sentence, used to turn a token budget into
    # a sentence limit when the caller does not supply one.
    _TOKENS_PER_SENTENCE = 20

    def __init__(self, tokenizer_service: Optional[TokenizerService] = None):
        # Use the supplied tokenizer, or build a default one when none is given.
        self.tokenizer = tokenizer_service or TokenizerService()

    def compress(self, content: str, target_tokens: int, **kwargs) -> CompressionResult:
        """Compress by extracting key sentences.

        Args:
            content: Text to compress.
            target_tokens: Token budget; also used to estimate how many
                sentences to keep when ``max_sentences`` is not given.
            **kwargs: Optional ``max_sentences`` (int) overrides the estimated
                sentence limit. ``Compressor.compress`` supplies this for the
                ``"aggressive_extract"`` method; previously it was silently
                ignored, so that method behaved exactly like ``"extractive"``.

        Returns:
            A ``CompressionResult`` with ``method_used="no_compression"`` when
            the content already fits, otherwise ``method_used="extractive"``.
            ``metadata["sentences_extracted"]`` holds the sentence limit that
            was passed to ``extract_key_info``.
        """
        original_tokens = self.tokenizer.count_tokens(content)

        if original_tokens <= target_tokens:
            return CompressionResult(
                compressed_content=content,
                original_tokens=original_tokens,
                compressed_tokens=original_tokens,
                compression_ratio=1.0,
                method_used="no_compression",
                metadata={"reason": "content_already_fits"},
            )

        requested = kwargs.get("max_sentences")
        if requested is not None:
            # Explicit caller-supplied limit; always keep at least one sentence.
            target_sentences = max(1, int(requested))
        else:
            # Rough estimate derived from the token budget.
            target_sentences = max(
                1, int(target_tokens / self._TOKENS_PER_SENTENCE)
            )

        key_info = extract_key_info(content, max_sentences=target_sentences)
        compressed_tokens = self.tokenizer.count_tokens(key_info)

        return CompressionResult(
            compressed_content=key_info,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=compressed_tokens / original_tokens,
            method_used="extractive",
            metadata={"sentences_extracted": target_sentences},
        )

    def get_name(self) -> str:
        """Return the strategy identifier."""
        return "extractive"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class AbstractiveCompressor(BaseCompressor):
    """Abstractive compressor using summarization."""

    def __init__(self, tokenizer_service: Optional[TokenizerService] = None):
        # Use the supplied tokenizer, or build a default one when none is given.
        self.tokenizer = tokenizer_service or TokenizerService()

    def compress(self, content: str, target_tokens: int, **kwargs) -> CompressionResult:
        """Summarize ``content`` via ``summarize_content``.

        Args:
            content: Text to summarize.
            target_tokens: Token budget; converted to a ratio of the original
                token count and handed to ``summarize_content``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``CompressionResult`` with ``method_used="no_compression"`` when
            the content already fits, otherwise ``method_used="abstractive"``.
        """
        token_count = self.tokenizer.count_tokens(content)

        if token_count > target_tokens:
            # Fraction of the original size the summary should aim for.
            wanted_ratio = target_tokens / token_count
            digest = summarize_content(
                content, target_ratio=wanted_ratio, preserve_keywords=True
            )
            digest_tokens = self.tokenizer.count_tokens(digest)
            return CompressionResult(
                compressed_content=digest,
                original_tokens=token_count,
                compressed_tokens=digest_tokens,
                compression_ratio=digest_tokens / token_count,
                method_used="abstractive",
                metadata={"target_ratio": wanted_ratio},
            )

        # Nothing to do: the input is already within the budget.
        return CompressionResult(
            compressed_content=content,
            original_tokens=token_count,
            compressed_tokens=token_count,
            compression_ratio=1.0,
            method_used="no_compression",
            metadata={"reason": "content_already_fits"},
        )

    def get_name(self) -> str:
        """Return the strategy identifier."""
        return "abstractive"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class SignatureOnlyCompressor(BaseCompressor):
    """Compressor that keeps only function signatures or essential info."""

    # Parenthesised list that tolerates one level of nested parentheses, so
    # defaults such as ``x=(1, 2)`` or ``y=foo()`` do not break the match.
    _PARENS = r"\((?:[^()]|\([^()]*\))*\)"

    # Python ``def`` / ``async def`` (with optional return annotation of any
    # shape, e.g. ``-> Dict[str, Any]``) and ``class`` statements, with or
    # without a base-class list.
    _PYTHON_PATTERN = (
        r"^\s*(?:"
        + r"(?:async\s+)?def\s+\w+\s*"
        + _PARENS
        + r"\s*(?:->\s*[^:\n]+?)?"
        + r"|class\s+\w+\s*(?:"
        + _PARENS
        + r")?"
        + r")\s*:"
    )

    # HTTP-style endpoint lines such as ``GET /users``.
    _API_PATTERN = r"^\s*(GET|POST|PUT|DELETE|PATCH)\s+[^\s]+"

    # SQL DDL statement headers (matched case-insensitively).
    _SCHEMA_PATTERN = r"^\s*(CREATE TABLE|ALTER TABLE|DROP TABLE)\s+\w+"

    def __init__(self, tokenizer_service: Optional[TokenizerService] = None):
        # Use the supplied tokenizer, or build a default one when none is given.
        self.tokenizer = tokenizer_service or TokenizerService()

    def compress(self, content: str, target_tokens: int, **kwargs) -> CompressionResult:
        """Compress by keeping only signatures and essential information.

        Args:
            content: Text to compress.
            target_tokens: Token budget for the result.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``CompressionResult``:

            * ``method_used="no_compression"`` when the content already fits
              (consistent with the other compressors in this module);
            * the result of ``TruncateCompressor`` when no signature is found;
            * otherwise ``method_used="signature_only"``. If the joined
              signatures still exceed the budget they are passed through
              ``TruncateCompressor`` and ``metadata["truncated_to_fit"]`` is
              ``True``.
        """
        original_tokens = self.tokenizer.count_tokens(content)

        if original_tokens <= target_tokens:
            return CompressionResult(
                compressed_content=content,
                original_tokens=original_tokens,
                compressed_tokens=original_tokens,
                compression_ratio=1.0,
                method_used="no_compression",
                metadata={"reason": "content_already_fits"},
            )

        signatures = self._extract_signatures(content)

        if not signatures:
            # Nothing recognisable to keep: fall back to simple truncation.
            return TruncateCompressor(self.tokenizer).compress(content, target_tokens)

        compressed = "\n".join(signatures)
        compressed_tokens = self.tokenizer.count_tokens(compressed)

        # The signature list alone can still be larger than the budget.
        truncated_to_fit = compressed_tokens > target_tokens
        if truncated_to_fit:
            clipped = TruncateCompressor(self.tokenizer).compress(
                compressed, target_tokens
            )
            compressed = clipped.compressed_content
            compressed_tokens = clipped.compressed_tokens

        return CompressionResult(
            compressed_content=compressed,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=compressed_tokens / original_tokens,
            method_used="signature_only",
            metadata={
                "signatures_extracted": len(signatures),
                "truncated_to_fit": truncated_to_fit,
            },
        )

    def _extract_signatures(self, content: str) -> List[str]:
        """Extract function signatures and class definitions.

        Results are grouped by kind (Python definitions, then API endpoint
        lines, then SQL schema statements); within each kind they keep the
        order in which they appear in ``content``.
        """
        pattern_specs = (
            (self._PYTHON_PATTERN, re.MULTILINE),
            (self._API_PATTERN, re.MULTILINE),
            (self._SCHEMA_PATTERN, re.MULTILINE | re.IGNORECASE),
        )

        signatures: List[str] = []
        for pattern, flags in pattern_specs:
            signatures.extend(
                match.group(0).strip()
                for match in re.finditer(pattern, content, flags)
            )
        return signatures

    def get_name(self) -> str:
        """Return the strategy identifier."""
        return "signature_only"
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class Compressor:
    """Main compressor service with multiple compression strategies."""

    def __init__(self, tokenizer_service: Optional[TokenizerService] = None):
        """
        Initialize compressor with available compression methods.

        Args:
            tokenizer_service: TokenizerService instance for token counting.
                A default ``TokenizerService()`` is created when omitted.
        """
        self.tokenizer = tokenizer_service or TokenizerService()
        # Registry of method name -> compressor; all share one tokenizer.
        self.compressors = {
            "truncate": TruncateCompressor(self.tokenizer),
            "extractive": ExtractiveCompressor(self.tokenizer),
            "abstractive": AbstractiveCompressor(self.tokenizer),
            "signature_only": SignatureOnlyCompressor(self.tokenizer),
            # Alias for extractive
            "task_summary": ExtractiveCompressor(self.tokenizer),
            # More aggressive extraction (see ``compress``)
            "aggressive_extract": ExtractiveCompressor(self.tokenizer),
        }

    def compress(
        self,
        content: Union[str, Messages],
        target_tokens: int,
        method: str = "extractive",
        **kwargs,
    ) -> CompressionResult:
        """
        Compress content using specified method.

        Args:
            content: Content to compress
            target_tokens: Target token count
            method: Compression method to use
            **kwargs: Additional arguments for specific compressors

        Returns:
            CompressionResult with compressed content and metadata

        Raises:
            ValueError: If ``method`` is not a registered compression method.
        """
        if method not in self.compressors:
            raise ValueError(f"Unknown compression method: {method}")

        compressor = self.compressors[method]

        # Add method-specific parameters: default to a single sentence unless
        # the caller chose a limit explicitly.
        if method == "aggressive_extract":
            kwargs.setdefault("max_sentences", 1)

        return compressor.compress(content, target_tokens, **kwargs)

    def compress_with_fallback(
        self, content: str, target_tokens: int, preferred_methods: List[str], **kwargs
    ) -> CompressionResult:
        """
        Compress content trying multiple methods in order of preference.

        A method counts as successful only when its result does not grow the
        content and actually fits within ``target_tokens``. Previously any
        result with a ratio <= 1.0 was accepted, so an over-budget result from
        the first method was returned and the fallback chain only advanced
        when a method raised.

        Args:
            content: Content to compress
            target_tokens: Target token count
            preferred_methods: List of compression methods to try
            **kwargs: Additional arguments for compressors

        Returns:
            CompressionResult from the first successful method, or the result
            of the "truncate" method when none succeeds.
        """
        for method in preferred_methods:
            try:
                result = self.compress(content, target_tokens, method, **kwargs)
                within_budget = result.compressed_tokens <= target_tokens
                if result.compression_ratio <= 1.0 and within_budget:
                    return result
            except Exception:
                # Deliberate best-effort: an unknown or failing method must
                # not abort the chain; move on to the next candidate.
                continue

        # Fallback to truncate
        return self.compress(content, target_tokens, "truncate", **kwargs)

    def batch_compress(
        self,
        contents: Dict[str, str],
        allocations: Dict[str, int],
        method: str = "extractive",
        **kwargs,
    ) -> Dict[str, CompressionResult]:
        """
        Compress multiple content sections with different token allocations.

        Sections that have no entry in ``allocations`` are skipped and do not
        appear in the returned dictionary.

        Args:
            contents: Dictionary of section names to content
            allocations: Dictionary of section names to token allocations
            method: Compression method to use
            **kwargs: Additional arguments for compressors

        Returns:
            Dictionary of section names to CompressionResults. A section whose
            compression raised is returned uncompressed with
            ``method_used="error"`` and the message in ``metadata["error"]``.
        """
        results = {}

        for section_name, content in contents.items():
            if section_name not in allocations:
                continue

            target_tokens = allocations[section_name]
            try:
                results[section_name] = self.compress(
                    content, target_tokens, method, **kwargs
                )
            except Exception as e:
                # The content is passed through untouched, so the compressed
                # token count must equal the original one (it used to be 0,
                # which made aggregated statistics under-report the size).
                token_count = self.tokenizer.count_tokens(content)
                results[section_name] = CompressionResult(
                    compressed_content=content,
                    original_tokens=token_count,
                    compressed_tokens=token_count,
                    compression_ratio=1.0,
                    method_used="error",
                    metadata={"error": str(e)},
                )

        return results

    def get_compression_stats(
        self, results: Dict[str, CompressionResult]
    ) -> Dict[str, Any]:
        """
        Get statistics from batch compression results.

        Args:
            results: Dictionary of compression results

        Returns:
            Statistics about the compression operation, or an empty dict when
            ``results`` is empty.
        """
        if not results:
            return {}

        total_original = sum(result.original_tokens for result in results.values())
        total_compressed = sum(result.compressed_tokens for result in results.values())

        return {
            "total_sections": len(results),
            "total_original_tokens": total_original,
            "total_compressed_tokens": total_compressed,
            "overall_compression_ratio": (
                total_compressed / total_original if total_original > 0 else 0
            ),
            # Unique method labels in first-seen order (deterministic output).
            "methods_used": list(
                dict.fromkeys(result.method_used for result in results.values())
            ),
            "section_stats": {
                section_name: {
                    "original_tokens": result.original_tokens,
                    "compressed_tokens": result.compressed_tokens,
                    "compression_ratio": result.compression_ratio,
                    "method_used": result.method_used,
                }
                for section_name, result in results.items()
            },
        }

    def add_custom_compressor(self, name: str, compressor: BaseCompressor):
        """Add a custom compressor implementation (replaces any existing entry)."""
        self.compressors[name] = compressor

    def get_available_methods(self) -> List[str]:
        """Get list of available compression methods."""
        return list(self.compressors)

    def get_compressor_info(self, method: str) -> Dict[str, Any]:
        """Get information about a specific compression method.

        The "type" is derived from the method key first and, when the key is
        inconclusive, from the compressor's own name. This lets aliases such
        as "task_summary" (an extractive compressor) be reported as
        "extractive" instead of defaulting to "truncation".

        Returns:
            A dict with "name", "description" and "type", or an empty dict
            when ``method`` is not registered.
        """
        if method not in self.compressors:
            return {}

        compressor = self.compressors[method]
        name = compressor.get_name()

        kind = "truncation"
        for label in (method, name):
            if "extract" in label:
                kind = "extractive"
                break
            if "abstract" in label:
                kind = "abstractive"
                break

        return {
            "name": name,
            "description": compressor.__class__.__doc__,
            "type": kind,
        }
|