kweaver-dolphin 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. DolphinLanguageSDK/__init__.py +58 -0
  2. dolphin/__init__.py +62 -0
  3. dolphin/cli/__init__.py +20 -0
  4. dolphin/cli/args/__init__.py +9 -0
  5. dolphin/cli/args/parser.py +567 -0
  6. dolphin/cli/builtin_agents/__init__.py +22 -0
  7. dolphin/cli/commands/__init__.py +4 -0
  8. dolphin/cli/interrupt/__init__.py +8 -0
  9. dolphin/cli/interrupt/handler.py +205 -0
  10. dolphin/cli/interrupt/keyboard.py +82 -0
  11. dolphin/cli/main.py +49 -0
  12. dolphin/cli/multimodal/__init__.py +34 -0
  13. dolphin/cli/multimodal/clipboard.py +327 -0
  14. dolphin/cli/multimodal/handler.py +249 -0
  15. dolphin/cli/multimodal/image_processor.py +214 -0
  16. dolphin/cli/multimodal/input_parser.py +149 -0
  17. dolphin/cli/runner/__init__.py +8 -0
  18. dolphin/cli/runner/runner.py +989 -0
  19. dolphin/cli/ui/__init__.py +10 -0
  20. dolphin/cli/ui/console.py +2795 -0
  21. dolphin/cli/ui/input.py +340 -0
  22. dolphin/cli/ui/layout.py +425 -0
  23. dolphin/cli/ui/stream_renderer.py +302 -0
  24. dolphin/cli/utils/__init__.py +8 -0
  25. dolphin/cli/utils/helpers.py +135 -0
  26. dolphin/cli/utils/version.py +49 -0
  27. dolphin/core/__init__.py +107 -0
  28. dolphin/core/agent/__init__.py +10 -0
  29. dolphin/core/agent/agent_state.py +69 -0
  30. dolphin/core/agent/base_agent.py +970 -0
  31. dolphin/core/code_block/__init__.py +0 -0
  32. dolphin/core/code_block/agent_init_block.py +0 -0
  33. dolphin/core/code_block/assign_block.py +98 -0
  34. dolphin/core/code_block/basic_code_block.py +1865 -0
  35. dolphin/core/code_block/explore_block.py +1327 -0
  36. dolphin/core/code_block/explore_block_v2.py +712 -0
  37. dolphin/core/code_block/explore_strategy.py +672 -0
  38. dolphin/core/code_block/judge_block.py +220 -0
  39. dolphin/core/code_block/prompt_block.py +32 -0
  40. dolphin/core/code_block/skill_call_deduplicator.py +291 -0
  41. dolphin/core/code_block/tool_block.py +129 -0
  42. dolphin/core/common/__init__.py +17 -0
  43. dolphin/core/common/constants.py +176 -0
  44. dolphin/core/common/enums.py +1173 -0
  45. dolphin/core/common/exceptions.py +133 -0
  46. dolphin/core/common/multimodal.py +539 -0
  47. dolphin/core/common/object_type.py +165 -0
  48. dolphin/core/common/output_format.py +432 -0
  49. dolphin/core/common/types.py +36 -0
  50. dolphin/core/config/__init__.py +16 -0
  51. dolphin/core/config/global_config.py +1289 -0
  52. dolphin/core/config/ontology_config.py +133 -0
  53. dolphin/core/context/__init__.py +12 -0
  54. dolphin/core/context/context.py +1580 -0
  55. dolphin/core/context/context_manager.py +161 -0
  56. dolphin/core/context/var_output.py +82 -0
  57. dolphin/core/context/variable_pool.py +356 -0
  58. dolphin/core/context_engineer/__init__.py +41 -0
  59. dolphin/core/context_engineer/config/__init__.py +5 -0
  60. dolphin/core/context_engineer/config/settings.py +402 -0
  61. dolphin/core/context_engineer/core/__init__.py +7 -0
  62. dolphin/core/context_engineer/core/budget_manager.py +327 -0
  63. dolphin/core/context_engineer/core/context_assembler.py +583 -0
  64. dolphin/core/context_engineer/core/context_manager.py +637 -0
  65. dolphin/core/context_engineer/core/tokenizer_service.py +260 -0
  66. dolphin/core/context_engineer/example/incremental_example.py +267 -0
  67. dolphin/core/context_engineer/example/traditional_example.py +334 -0
  68. dolphin/core/context_engineer/services/__init__.py +5 -0
  69. dolphin/core/context_engineer/services/compressor.py +399 -0
  70. dolphin/core/context_engineer/utils/__init__.py +6 -0
  71. dolphin/core/context_engineer/utils/context_utils.py +441 -0
  72. dolphin/core/context_engineer/utils/message_formatter.py +270 -0
  73. dolphin/core/context_engineer/utils/token_utils.py +139 -0
  74. dolphin/core/coroutine/__init__.py +15 -0
  75. dolphin/core/coroutine/context_snapshot.py +154 -0
  76. dolphin/core/coroutine/context_snapshot_profile.py +922 -0
  77. dolphin/core/coroutine/context_snapshot_store.py +268 -0
  78. dolphin/core/coroutine/execution_frame.py +145 -0
  79. dolphin/core/coroutine/execution_state_registry.py +161 -0
  80. dolphin/core/coroutine/resume_handle.py +101 -0
  81. dolphin/core/coroutine/step_result.py +101 -0
  82. dolphin/core/executor/__init__.py +18 -0
  83. dolphin/core/executor/debug_controller.py +630 -0
  84. dolphin/core/executor/dolphin_executor.py +1063 -0
  85. dolphin/core/executor/executor.py +624 -0
  86. dolphin/core/flags/__init__.py +27 -0
  87. dolphin/core/flags/definitions.py +49 -0
  88. dolphin/core/flags/manager.py +113 -0
  89. dolphin/core/hook/__init__.py +95 -0
  90. dolphin/core/hook/expression_evaluator.py +499 -0
  91. dolphin/core/hook/hook_dispatcher.py +380 -0
  92. dolphin/core/hook/hook_types.py +248 -0
  93. dolphin/core/hook/isolated_variable_pool.py +284 -0
  94. dolphin/core/interfaces.py +53 -0
  95. dolphin/core/llm/__init__.py +0 -0
  96. dolphin/core/llm/llm.py +495 -0
  97. dolphin/core/llm/llm_call.py +100 -0
  98. dolphin/core/llm/llm_client.py +1285 -0
  99. dolphin/core/llm/message_sanitizer.py +120 -0
  100. dolphin/core/logging/__init__.py +20 -0
  101. dolphin/core/logging/logger.py +526 -0
  102. dolphin/core/message/__init__.py +8 -0
  103. dolphin/core/message/compressor.py +749 -0
  104. dolphin/core/parser/__init__.py +8 -0
  105. dolphin/core/parser/parser.py +405 -0
  106. dolphin/core/runtime/__init__.py +10 -0
  107. dolphin/core/runtime/runtime_graph.py +926 -0
  108. dolphin/core/runtime/runtime_instance.py +446 -0
  109. dolphin/core/skill/__init__.py +14 -0
  110. dolphin/core/skill/context_retention.py +157 -0
  111. dolphin/core/skill/skill_function.py +686 -0
  112. dolphin/core/skill/skill_matcher.py +282 -0
  113. dolphin/core/skill/skillkit.py +700 -0
  114. dolphin/core/skill/skillset.py +72 -0
  115. dolphin/core/trajectory/__init__.py +10 -0
  116. dolphin/core/trajectory/recorder.py +189 -0
  117. dolphin/core/trajectory/trajectory.py +522 -0
  118. dolphin/core/utils/__init__.py +9 -0
  119. dolphin/core/utils/cache_kv.py +212 -0
  120. dolphin/core/utils/tools.py +340 -0
  121. dolphin/lib/__init__.py +93 -0
  122. dolphin/lib/debug/__init__.py +8 -0
  123. dolphin/lib/debug/visualizer.py +409 -0
  124. dolphin/lib/memory/__init__.py +28 -0
  125. dolphin/lib/memory/async_processor.py +220 -0
  126. dolphin/lib/memory/llm_calls.py +195 -0
  127. dolphin/lib/memory/manager.py +78 -0
  128. dolphin/lib/memory/sandbox.py +46 -0
  129. dolphin/lib/memory/storage.py +245 -0
  130. dolphin/lib/memory/utils.py +51 -0
  131. dolphin/lib/ontology/__init__.py +12 -0
  132. dolphin/lib/ontology/basic/__init__.py +0 -0
  133. dolphin/lib/ontology/basic/base.py +102 -0
  134. dolphin/lib/ontology/basic/concept.py +130 -0
  135. dolphin/lib/ontology/basic/object.py +11 -0
  136. dolphin/lib/ontology/basic/relation.py +63 -0
  137. dolphin/lib/ontology/datasource/__init__.py +27 -0
  138. dolphin/lib/ontology/datasource/datasource.py +66 -0
  139. dolphin/lib/ontology/datasource/oracle_datasource.py +338 -0
  140. dolphin/lib/ontology/datasource/sql.py +845 -0
  141. dolphin/lib/ontology/mapping.py +177 -0
  142. dolphin/lib/ontology/ontology.py +733 -0
  143. dolphin/lib/ontology/ontology_context.py +16 -0
  144. dolphin/lib/ontology/ontology_manager.py +107 -0
  145. dolphin/lib/skill_results/__init__.py +31 -0
  146. dolphin/lib/skill_results/cache_backend.py +559 -0
  147. dolphin/lib/skill_results/result_processor.py +181 -0
  148. dolphin/lib/skill_results/result_reference.py +179 -0
  149. dolphin/lib/skill_results/skillkit_hook.py +324 -0
  150. dolphin/lib/skill_results/strategies.py +328 -0
  151. dolphin/lib/skill_results/strategy_registry.py +150 -0
  152. dolphin/lib/skillkits/__init__.py +44 -0
  153. dolphin/lib/skillkits/agent_skillkit.py +155 -0
  154. dolphin/lib/skillkits/cognitive_skillkit.py +82 -0
  155. dolphin/lib/skillkits/env_skillkit.py +250 -0
  156. dolphin/lib/skillkits/mcp_adapter.py +616 -0
  157. dolphin/lib/skillkits/mcp_skillkit.py +771 -0
  158. dolphin/lib/skillkits/memory_skillkit.py +650 -0
  159. dolphin/lib/skillkits/noop_skillkit.py +31 -0
  160. dolphin/lib/skillkits/ontology_skillkit.py +89 -0
  161. dolphin/lib/skillkits/plan_act_skillkit.py +452 -0
  162. dolphin/lib/skillkits/resource/__init__.py +52 -0
  163. dolphin/lib/skillkits/resource/models/__init__.py +6 -0
  164. dolphin/lib/skillkits/resource/models/skill_config.py +109 -0
  165. dolphin/lib/skillkits/resource/models/skill_meta.py +127 -0
  166. dolphin/lib/skillkits/resource/resource_skillkit.py +393 -0
  167. dolphin/lib/skillkits/resource/skill_cache.py +215 -0
  168. dolphin/lib/skillkits/resource/skill_loader.py +395 -0
  169. dolphin/lib/skillkits/resource/skill_validator.py +406 -0
  170. dolphin/lib/skillkits/resource_skillkit.py +11 -0
  171. dolphin/lib/skillkits/search_skillkit.py +163 -0
  172. dolphin/lib/skillkits/sql_skillkit.py +274 -0
  173. dolphin/lib/skillkits/system_skillkit.py +509 -0
  174. dolphin/lib/skillkits/vm_skillkit.py +65 -0
  175. dolphin/lib/utils/__init__.py +9 -0
  176. dolphin/lib/utils/data_process.py +207 -0
  177. dolphin/lib/utils/handle_progress.py +178 -0
  178. dolphin/lib/utils/security.py +139 -0
  179. dolphin/lib/utils/text_retrieval.py +462 -0
  180. dolphin/lib/vm/__init__.py +11 -0
  181. dolphin/lib/vm/env_executor.py +895 -0
  182. dolphin/lib/vm/python_session_manager.py +453 -0
  183. dolphin/lib/vm/vm.py +610 -0
  184. dolphin/sdk/__init__.py +60 -0
  185. dolphin/sdk/agent/__init__.py +12 -0
  186. dolphin/sdk/agent/agent_factory.py +236 -0
  187. dolphin/sdk/agent/dolphin_agent.py +1106 -0
  188. dolphin/sdk/api/__init__.py +4 -0
  189. dolphin/sdk/runtime/__init__.py +8 -0
  190. dolphin/sdk/runtime/env.py +363 -0
  191. dolphin/sdk/skill/__init__.py +10 -0
  192. dolphin/sdk/skill/global_skills.py +706 -0
  193. dolphin/sdk/skill/traditional_toolkit.py +260 -0
  194. kweaver_dolphin-0.1.0.dist-info/METADATA +521 -0
  195. kweaver_dolphin-0.1.0.dist-info/RECORD +199 -0
  196. kweaver_dolphin-0.1.0.dist-info/WHEEL +5 -0
  197. kweaver_dolphin-0.1.0.dist-info/entry_points.txt +27 -0
  198. kweaver_dolphin-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
  199. kweaver_dolphin-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,462 @@
1
+ """Text Retrieval Utilities
2
+
3
+ Shared text retrieval related tools, including:
4
+ - Tokenizer
5
+ - BM25Index - based on rank_bm25
6
+ - VectorUtils
7
+
8
+ Shared for use by modules such as memory_skillkit and local_retrieval_skillkit
9
+ """
10
+
11
+ import os
12
+ import re
13
+ from typing import Dict, List, Optional, Tuple
14
+ import math
15
+ from collections import Counter
16
+
17
+ # Allow forcing the use of pure Python BM25 via environment variable to avoid C extension issues in specific environments
18
+ _FORCE_PURE_PY = os.getenv("FORCE_PURE_PY_BM25", "").strip() == "1"
19
+ try:
20
+ if _FORCE_PURE_PY:
21
+ raise ImportError("FORCE_PURE_PY_BM25=1")
22
+ from rank_bm25 import BM25Okapi
23
+
24
+ _HAS_RANK_BM25 = True
25
+ except ImportError:
26
+ _HAS_RANK_BM25 = False
27
+ BM25Okapi = None
28
+
29
+
30
+ # -----------------------------
31
+ # Tokenizer Tool
32
+ # -----------------------------
33
+
34
+ _WORD_RE = re.compile(r"[A-Za-z0-9_]+", re.UNICODE)
35
+
36
+
37
def is_cjk(ch: str) -> bool:
    """Return True if *ch* falls in a CJK block (Chinese, Japanese, Korean)."""
    cp = ord(ch)
    # Inclusive Unicode code-point ranges, same set as the original chain.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        (0x2A700, 0x2B73F),  # CJK Extension C
        (0x2B740, 0x2B81F),  # CJK Extension D
        (0x2B820, 0x2CEAF),  # CJK Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        (0xAC00, 0xD7AF),    # Hangul Syllables (Korean)
        (0x1100, 0x11FF),    # Hangul Jamo
        (0x3130, 0x318F),    # Hangul Compatibility Jamo
        (0xA960, 0xA97F),    # Hangul Jamo Extended-A
        (0xD7B0, 0xD7FF),    # Hangul Jamo Extended-B
    )
    return any(lo <= cp <= hi for lo, hi in cjk_ranges)
55
+
56
+
57
def tokenize_simple(text: str) -> List[str]:
    """Simple tokenizer supporting ASCII words and CJK characters.

    ASCII alphanumeric/underscore runs are emitted as lowercase word
    tokens; every CJK character becomes its own unigram token.
    """
    if not text:
        return []

    out: List[str] = []
    pending: List[str] = []

    def _flush() -> None:
        # Emit the buffered ASCII run (if any) as one lowercase token.
        if pending:
            out.append("".join(pending).lower())
            pending.clear()

    for ch in text:
        if is_cjk(ch):
            _flush()
            out.append(ch)
        elif ch.isalnum() or ch == "_":
            pending.append(ch)
        else:
            _flush()

    _flush()
    return out
92
+
93
+
94
def tokenize_bigram_cjk(text: str) -> List[str]:
    """Bigram tokenizer for CJK text mixed with ASCII words.

    Chinese runs are split into overlapping two-character bigrams;
    ASCII/number chunks are kept whole. Single-character non-CJK
    chunks are dropped.
    """
    # Lowercase, then collapse everything except word chars and CJK
    # ideographs into spaces so split() yields clean chunks.
    cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", text.lower())

    result: List[str] = []
    for piece in cleaned.split():
        if re.search(r"[\u4e00-\u9fff]", piece):
            # CJK chunk: bigram segmentation (a lone char stays as-is).
            if len(piece) >= 2:
                result.extend(piece[i : i + 2] for i in range(len(piece) - 1))
            else:
                result.append(piece)
        elif len(piece) > 1:
            # ASCII/number chunk kept whole; single chars are noise.
            result.append(piece)
    return result
115
+
116
+
117
+ # -----------------------------
118
+ # BM25 Index
119
+ # -----------------------------
120
+
121
+
122
class BM25Index:
    """BM25 ranking index.

    Uses the third-party ``rank_bm25`` library (``BM25Okapi``) when it is
    importable; otherwise scores with the pure-Python Okapi BM25 fallback
    implemented in this class. The fallback is also selected when the
    module-level ``FORCE_PURE_PY_BM25=1`` switch suppressed the import.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """Create an empty index.

        Args:
            k1: Term-frequency saturation parameter.
            b: Document-length normalization parameter.

        Note:
            An earlier revision raised ``ImportError`` here whenever
            ``rank_bm25`` was unavailable, which made the pure-Python
            fallback below (and the FORCE_PURE_PY_BM25 env switch)
            unreachable. The index now works without the library.
        """
        self.k1 = k1
        self.b = b
        self._bm25_index = None  # rank_bm25 BM25Okapi instance, if available
        self._doc_ids: List[int] = []
        self._tokenizer_func = tokenize_simple

        # State used only by the pure-Python fallback scorer.
        self._tokenized_docs: List[List[str]] = []
        self._tf_docs: List[Counter] = []
        self._idf: Dict[str, float] = {}
        self._doc_lens: List[int] = []
        self._avgdl: float = 0.0

    def _reset_fallback_state(self) -> None:
        """Clear the pure-Python scorer's cached structures."""
        self._tokenized_docs = []
        self._tf_docs = []
        self._idf = {}
        self._doc_lens = []
        self._avgdl = 0.0

    def build_from_corpus(self, documents: Dict[int, str], tokenizer_func=None) -> None:
        """(Re)build the index from a ``{doc_id: text}`` corpus.

        Args:
            documents: Corpus to index; replaces any previous contents.
            tokenizer_func: Callable ``str -> List[str]``; defaults to
                :func:`tokenize_simple`.
        """
        if tokenizer_func is None:
            tokenizer_func = tokenize_simple

        self._tokenizer_func = tokenizer_func
        self._doc_ids = list(documents.keys())

        # Empty corpus: reset everything so search() returns [].
        if not documents:
            self._bm25_index = None
            self._reset_fallback_state()
            return

        # Tokenize in doc-id order so score indices align with _doc_ids.
        tokenized_docs = [tokenizer_func(documents[doc_id]) for doc_id in self._doc_ids]

        if _HAS_RANK_BM25:
            # Third-party implementation; fallback state is not needed.
            self._bm25_index = BM25Okapi(tokenized_docs, k1=self.k1, b=self.b)
            self._reset_fallback_state()
        else:
            # Pure-Python fallback: precompute tf, doc lengths, avgdl, idf.
            self._bm25_index = None
            self._tokenized_docs = tokenized_docs
            self._tf_docs = [Counter(doc) for doc in tokenized_docs]
            self._doc_lens = [len(doc) for doc in tokenized_docs]
            self._avgdl = (
                sum(self._doc_lens) / len(self._doc_lens) if self._doc_lens else 0.0
            )

            N = len(tokenized_docs)
            df: Counter = Counter()
            for doc in tokenized_docs:
                df.update(set(doc))
            # Smoothed IDF consistent with common BM25 implementations.
            self._idf = {
                term: math.log((N - dfi + 0.5) / (dfi + 0.5) + 1)
                for term, dfi in df.items()
            }

    def _score_pure_python(self, query_tokens: List[str]) -> List[float]:
        """Okapi BM25 scores for every indexed document (fallback path)."""
        scores = [0.0] * len(self._doc_ids)
        for i, tf in enumerate(self._tf_docs):
            dl = self._doc_lens[i] if self._doc_lens else 0
            for term in query_tokens:
                idf = self._idf.get(term)
                if idf is None:
                    continue  # term never seen in the corpus
                tf_i = tf.get(term, 0)
                if tf_i == 0:
                    continue  # term absent from this document
                denom = tf_i + self.k1 * (
                    1
                    - self.b
                    + self.b * (dl / self._avgdl if self._avgdl > 0 else 0)
                )
                scores[i] += idf * (tf_i * (self.k1 + 1)) / denom
        return scores

    def search(
        self,
        query: str,
        allowed_doc_ids: Optional[set] = None,
        topk: int = 10,
        tokenizer_func=None,
    ) -> List[Tuple[int, float]]:
        """Return up to *topk* ``(doc_id, score)`` pairs, best first.

        Args:
            query: Free-text query.
            allowed_doc_ids: If given, restrict results to these ids.
            topk: Maximum number of results.
            tokenizer_func: Override tokenizer; defaults to the one used
                at build time.
        """
        # No index at all: never built, or built over an empty corpus.
        if self._bm25_index is None and not self._tf_docs:
            return []

        if tokenizer_func is None:
            tokenizer_func = self._tokenizer_func

        query_tokens = tokenizer_func(query)
        if not query_tokens:
            return []

        if _HAS_RANK_BM25 and self._bm25_index is not None:
            scores = self._bm25_index.get_scores(query_tokens)
        else:
            scores = self._score_pure_python(query_tokens)

        # A BM25 score of 0 means no query term occurs in the document,
        # so such documents are filtered out as irrelevant.
        results = []
        for i, score in enumerate(scores):
            if score != 0.0:
                doc_id = self._doc_ids[i]
                if allowed_doc_ids is None or doc_id in allowed_doc_ids:
                    results.append((doc_id, float(score)))

        results.sort(key=lambda x: x[1], reverse=True)
        return results[:topk]

    def add_or_update(self, doc_id: int, text: str, tokenizer_func=None) -> None:
        """Add or update a single document (requires rebuilding the index).

        Raises:
            NotImplementedError: If an index already exists; rebuild via
                :meth:`build_from_corpus` instead (a rank_bm25 limitation).
        """
        # If no index exists yet, a single-document build is equivalent.
        if self._bm25_index is None and not self._tf_docs:
            self.build_from_corpus({doc_id: text}, tokenizer_func)
            return

        raise NotImplementedError(
            "Dynamic update requires rebuilding the index. Use build_from_corpus instead."
        )

    def remove(self, doc_id: int) -> None:
        """Delete a document (unsupported; rebuild the index instead).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Document removal requires rebuilding the index. Use build_from_corpus instead."
        )

    @property
    def N(self) -> int:
        """Total number of indexed documents."""
        return len(self._doc_ids) if self._doc_ids else 0

    # Backward-compatible alias.
    def search_optimized(
        self,
        query: str,
        allowed_doc_ids: Optional[set] = None,
        topk: int = 10,
        tokenizer_func=None,
    ) -> List[Tuple[int, float]]:
        """Alias for :meth:`search` kept for backward compatibility."""
        return self.search(query, allowed_doc_ids, topk, tokenizer_func)
288
+
289
+
290
+ # -----------------------------
291
+ # Vector Calculation Tools
292
+ # -----------------------------
293
+
294
+
295
class VectorUtils:
    """Static helpers for vector math on plain Python lists."""

    @staticmethod
    def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
        """Cosine similarity; 0.0 on length mismatch or zero-norm input."""
        if len(vec1) != len(vec2):
            return 0.0

        dot = sum(x * y for x, y in zip(vec1, vec2))
        n1 = sum(x * x for x in vec1) ** 0.5
        n2 = sum(y * y for y in vec2) ** 0.5

        return 0.0 if n1 == 0 or n2 == 0 else dot / (n1 * n2)

    @staticmethod
    def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
        """Euclidean (L2) distance; infinity on length mismatch."""
        if len(vec1) != len(vec2):
            return float("inf")
        return sum((x - y) ** 2 for x, y in zip(vec1, vec2)) ** 0.5

    @staticmethod
    def normalize_l2(vec: List[float]) -> List[float]:
        """Return *vec* scaled to unit L2 norm (copy returned unchanged if zero)."""
        norm = sum(v * v for v in vec) ** 0.5
        return [v / norm for v in vec] if norm > 0 else vec.copy()

    @staticmethod
    def compute_simple_embedding(
        text: str, dim: int = 384, tokenizer_func=None
    ) -> List[float]:
        """Hash-based bag-of-tokens embedding (cheap fallback, not semantic)."""
        import hashlib

        tok = tokenizer_func if tokenizer_func is not None else tokenize_simple

        vec = [0.0] * dim
        for token in tok(text) or [""]:
            digest = hashlib.md5(token.encode("utf-8")).digest()
            # First 16 digest bytes -> two bucket indices and two weights.
            i1 = int.from_bytes(digest[:4], "big") % dim
            i2 = int.from_bytes(digest[4:8], "big") % dim
            vec[i1] += (int.from_bytes(digest[8:12], "big") % 100) / 100.0 + 1.0
            vec[i2] += (int.from_bytes(digest[12:16], "big") % 100) / 100.0 + 0.5

        return VectorUtils.normalize_l2(vec)
353
+
354
+
355
+ # -----------------------------
356
+ # Hybrid Retrieval Tool
357
+ # -----------------------------
358
+
359
+
360
class HybridRetriever:
    """Blend BM25 and embedding similarity scores into one ranked list."""

    def __init__(self, bm25_weight: float = 0.7):
        # Embedding weight is the complement so the two always sum to 1.
        self.bm25_weight = bm25_weight
        self.embedding_weight = 1.0 - bm25_weight

    def combine_scores(
        self,
        bm25_results: List[Tuple[int, float]],
        embedding_results: List[Tuple[int, float]],
    ) -> List[Tuple[int, float]]:
        """Min-max normalize each score set, mix by weight, sort descending."""
        bm25_map = dict(bm25_results)
        emb_map = dict(embedding_results)

        doc_ids = set(bm25_map) | set(emb_map)
        if not doc_ids:
            return []

        def _normalized(score_map: Dict[int, float]) -> Dict[int, float]:
            # Min-max rescale to [0, 1]; a constant score set maps to all 1.0.
            if not score_map:
                return {}
            values = list(score_map.values())
            lo, hi = min(values), max(values)
            if hi == lo:
                return {doc_id: 1.0 for doc_id in score_map}
            span = hi - lo
            return {doc_id: (v - lo) / span for doc_id, v in score_map.items()}

        bm25_norm = _normalized(bm25_map)
        emb_norm = _normalized(emb_map)

        # Weighted mix; ids missing from one side contribute 0 there.
        ranked = [
            (
                doc_id,
                self.bm25_weight * bm25_norm.get(doc_id, 0.0)
                + self.embedding_weight * emb_norm.get(doc_id, 0.0),
            )
            for doc_id in doc_ids
        ]
        return sorted(ranked, key=lambda item: item[1], reverse=True)
416
+
417
+
418
+ # -----------------------------
419
+ # Convenience functions
420
+ # -----------------------------
421
+
422
+
423
def create_bm25_index(
    documents: Dict[int, str],
    tokenizer: str = "simple",
    k1: float = 1.5,
    b: float = 0.75,
) -> BM25Index:
    """Build a ready-to-query :class:`BM25Index` over *documents*.

    Args:
        documents: Document dictionary ``{doc_id: content}``.
        tokenizer: Tokenizer name, ``"simple"`` or ``"bigram_cjk"``
            (unknown names fall back to ``"simple"``).
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalization parameter.
    """
    if tokenizer == "bigram_cjk":
        chosen_tokenizer = tokenize_bigram_cjk
    else:
        chosen_tokenizer = tokenize_simple

    index = BM25Index(k1=k1, b=b)
    index.build_from_corpus(documents, chosen_tokenizer)
    return index
444
+
445
+
446
def search_documents(
    index: BM25Index, query: str, topk: int = 10, tokenizer: str = "simple"
) -> List[Tuple[int, float]]:
    """Run *query* against *index* and return up to *topk* (doc_id, score) pairs.

    Args:
        index: A built :class:`BM25Index`.
        query: Query string.
        topk: Number of results to return.
        tokenizer: Tokenizer name, ``"simple"`` or ``"bigram_cjk"``
            (unknown names fall back to ``"simple"``).
    """
    chosen_tokenizer = (
        tokenize_bigram_cjk if tokenizer == "bigram_cjk" else tokenize_simple
    )
    return index.search(query, topk=topk, tokenizer_func=chosen_tokenizer)
@@ -0,0 +1,11 @@
1
# -*- coding: utf-8 -*-
"""VM module - virtual machine utilities.

Re-exports the public VM classes so callers can import them directly
from ``dolphin.lib.vm``.
"""

from dolphin.lib.vm.vm import VM, VMSSH
from dolphin.lib.vm.python_session_manager import PythonSessionManager

__all__ = [
    "VM",
    "VMSSH",
    "PythonSessionManager",
]