kweaver_dolphin-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DolphinLanguageSDK/__init__.py +58 -0
- dolphin/__init__.py +62 -0
- dolphin/cli/__init__.py +20 -0
- dolphin/cli/args/__init__.py +9 -0
- dolphin/cli/args/parser.py +567 -0
- dolphin/cli/builtin_agents/__init__.py +22 -0
- dolphin/cli/commands/__init__.py +4 -0
- dolphin/cli/interrupt/__init__.py +8 -0
- dolphin/cli/interrupt/handler.py +205 -0
- dolphin/cli/interrupt/keyboard.py +82 -0
- dolphin/cli/main.py +49 -0
- dolphin/cli/multimodal/__init__.py +34 -0
- dolphin/cli/multimodal/clipboard.py +327 -0
- dolphin/cli/multimodal/handler.py +249 -0
- dolphin/cli/multimodal/image_processor.py +214 -0
- dolphin/cli/multimodal/input_parser.py +149 -0
- dolphin/cli/runner/__init__.py +8 -0
- dolphin/cli/runner/runner.py +989 -0
- dolphin/cli/ui/__init__.py +10 -0
- dolphin/cli/ui/console.py +2795 -0
- dolphin/cli/ui/input.py +340 -0
- dolphin/cli/ui/layout.py +425 -0
- dolphin/cli/ui/stream_renderer.py +302 -0
- dolphin/cli/utils/__init__.py +8 -0
- dolphin/cli/utils/helpers.py +135 -0
- dolphin/cli/utils/version.py +49 -0
- dolphin/core/__init__.py +107 -0
- dolphin/core/agent/__init__.py +10 -0
- dolphin/core/agent/agent_state.py +69 -0
- dolphin/core/agent/base_agent.py +970 -0
- dolphin/core/code_block/__init__.py +0 -0
- dolphin/core/code_block/agent_init_block.py +0 -0
- dolphin/core/code_block/assign_block.py +98 -0
- dolphin/core/code_block/basic_code_block.py +1865 -0
- dolphin/core/code_block/explore_block.py +1327 -0
- dolphin/core/code_block/explore_block_v2.py +712 -0
- dolphin/core/code_block/explore_strategy.py +672 -0
- dolphin/core/code_block/judge_block.py +220 -0
- dolphin/core/code_block/prompt_block.py +32 -0
- dolphin/core/code_block/skill_call_deduplicator.py +291 -0
- dolphin/core/code_block/tool_block.py +129 -0
- dolphin/core/common/__init__.py +17 -0
- dolphin/core/common/constants.py +176 -0
- dolphin/core/common/enums.py +1173 -0
- dolphin/core/common/exceptions.py +133 -0
- dolphin/core/common/multimodal.py +539 -0
- dolphin/core/common/object_type.py +165 -0
- dolphin/core/common/output_format.py +432 -0
- dolphin/core/common/types.py +36 -0
- dolphin/core/config/__init__.py +16 -0
- dolphin/core/config/global_config.py +1289 -0
- dolphin/core/config/ontology_config.py +133 -0
- dolphin/core/context/__init__.py +12 -0
- dolphin/core/context/context.py +1580 -0
- dolphin/core/context/context_manager.py +161 -0
- dolphin/core/context/var_output.py +82 -0
- dolphin/core/context/variable_pool.py +356 -0
- dolphin/core/context_engineer/__init__.py +41 -0
- dolphin/core/context_engineer/config/__init__.py +5 -0
- dolphin/core/context_engineer/config/settings.py +402 -0
- dolphin/core/context_engineer/core/__init__.py +7 -0
- dolphin/core/context_engineer/core/budget_manager.py +327 -0
- dolphin/core/context_engineer/core/context_assembler.py +583 -0
- dolphin/core/context_engineer/core/context_manager.py +637 -0
- dolphin/core/context_engineer/core/tokenizer_service.py +260 -0
- dolphin/core/context_engineer/example/incremental_example.py +267 -0
- dolphin/core/context_engineer/example/traditional_example.py +334 -0
- dolphin/core/context_engineer/services/__init__.py +5 -0
- dolphin/core/context_engineer/services/compressor.py +399 -0
- dolphin/core/context_engineer/utils/__init__.py +6 -0
- dolphin/core/context_engineer/utils/context_utils.py +441 -0
- dolphin/core/context_engineer/utils/message_formatter.py +270 -0
- dolphin/core/context_engineer/utils/token_utils.py +139 -0
- dolphin/core/coroutine/__init__.py +15 -0
- dolphin/core/coroutine/context_snapshot.py +154 -0
- dolphin/core/coroutine/context_snapshot_profile.py +922 -0
- dolphin/core/coroutine/context_snapshot_store.py +268 -0
- dolphin/core/coroutine/execution_frame.py +145 -0
- dolphin/core/coroutine/execution_state_registry.py +161 -0
- dolphin/core/coroutine/resume_handle.py +101 -0
- dolphin/core/coroutine/step_result.py +101 -0
- dolphin/core/executor/__init__.py +18 -0
- dolphin/core/executor/debug_controller.py +630 -0
- dolphin/core/executor/dolphin_executor.py +1063 -0
- dolphin/core/executor/executor.py +624 -0
- dolphin/core/flags/__init__.py +27 -0
- dolphin/core/flags/definitions.py +49 -0
- dolphin/core/flags/manager.py +113 -0
- dolphin/core/hook/__init__.py +95 -0
- dolphin/core/hook/expression_evaluator.py +499 -0
- dolphin/core/hook/hook_dispatcher.py +380 -0
- dolphin/core/hook/hook_types.py +248 -0
- dolphin/core/hook/isolated_variable_pool.py +284 -0
- dolphin/core/interfaces.py +53 -0
- dolphin/core/llm/__init__.py +0 -0
- dolphin/core/llm/llm.py +495 -0
- dolphin/core/llm/llm_call.py +100 -0
- dolphin/core/llm/llm_client.py +1285 -0
- dolphin/core/llm/message_sanitizer.py +120 -0
- dolphin/core/logging/__init__.py +20 -0
- dolphin/core/logging/logger.py +526 -0
- dolphin/core/message/__init__.py +8 -0
- dolphin/core/message/compressor.py +749 -0
- dolphin/core/parser/__init__.py +8 -0
- dolphin/core/parser/parser.py +405 -0
- dolphin/core/runtime/__init__.py +10 -0
- dolphin/core/runtime/runtime_graph.py +926 -0
- dolphin/core/runtime/runtime_instance.py +446 -0
- dolphin/core/skill/__init__.py +14 -0
- dolphin/core/skill/context_retention.py +157 -0
- dolphin/core/skill/skill_function.py +686 -0
- dolphin/core/skill/skill_matcher.py +282 -0
- dolphin/core/skill/skillkit.py +700 -0
- dolphin/core/skill/skillset.py +72 -0
- dolphin/core/trajectory/__init__.py +10 -0
- dolphin/core/trajectory/recorder.py +189 -0
- dolphin/core/trajectory/trajectory.py +522 -0
- dolphin/core/utils/__init__.py +9 -0
- dolphin/core/utils/cache_kv.py +212 -0
- dolphin/core/utils/tools.py +340 -0
- dolphin/lib/__init__.py +93 -0
- dolphin/lib/debug/__init__.py +8 -0
- dolphin/lib/debug/visualizer.py +409 -0
- dolphin/lib/memory/__init__.py +28 -0
- dolphin/lib/memory/async_processor.py +220 -0
- dolphin/lib/memory/llm_calls.py +195 -0
- dolphin/lib/memory/manager.py +78 -0
- dolphin/lib/memory/sandbox.py +46 -0
- dolphin/lib/memory/storage.py +245 -0
- dolphin/lib/memory/utils.py +51 -0
- dolphin/lib/ontology/__init__.py +12 -0
- dolphin/lib/ontology/basic/__init__.py +0 -0
- dolphin/lib/ontology/basic/base.py +102 -0
- dolphin/lib/ontology/basic/concept.py +130 -0
- dolphin/lib/ontology/basic/object.py +11 -0
- dolphin/lib/ontology/basic/relation.py +63 -0
- dolphin/lib/ontology/datasource/__init__.py +27 -0
- dolphin/lib/ontology/datasource/datasource.py +66 -0
- dolphin/lib/ontology/datasource/oracle_datasource.py +338 -0
- dolphin/lib/ontology/datasource/sql.py +845 -0
- dolphin/lib/ontology/mapping.py +177 -0
- dolphin/lib/ontology/ontology.py +733 -0
- dolphin/lib/ontology/ontology_context.py +16 -0
- dolphin/lib/ontology/ontology_manager.py +107 -0
- dolphin/lib/skill_results/__init__.py +31 -0
- dolphin/lib/skill_results/cache_backend.py +559 -0
- dolphin/lib/skill_results/result_processor.py +181 -0
- dolphin/lib/skill_results/result_reference.py +179 -0
- dolphin/lib/skill_results/skillkit_hook.py +324 -0
- dolphin/lib/skill_results/strategies.py +328 -0
- dolphin/lib/skill_results/strategy_registry.py +150 -0
- dolphin/lib/skillkits/__init__.py +44 -0
- dolphin/lib/skillkits/agent_skillkit.py +155 -0
- dolphin/lib/skillkits/cognitive_skillkit.py +82 -0
- dolphin/lib/skillkits/env_skillkit.py +250 -0
- dolphin/lib/skillkits/mcp_adapter.py +616 -0
- dolphin/lib/skillkits/mcp_skillkit.py +771 -0
- dolphin/lib/skillkits/memory_skillkit.py +650 -0
- dolphin/lib/skillkits/noop_skillkit.py +31 -0
- dolphin/lib/skillkits/ontology_skillkit.py +89 -0
- dolphin/lib/skillkits/plan_act_skillkit.py +452 -0
- dolphin/lib/skillkits/resource/__init__.py +52 -0
- dolphin/lib/skillkits/resource/models/__init__.py +6 -0
- dolphin/lib/skillkits/resource/models/skill_config.py +109 -0
- dolphin/lib/skillkits/resource/models/skill_meta.py +127 -0
- dolphin/lib/skillkits/resource/resource_skillkit.py +393 -0
- dolphin/lib/skillkits/resource/skill_cache.py +215 -0
- dolphin/lib/skillkits/resource/skill_loader.py +395 -0
- dolphin/lib/skillkits/resource/skill_validator.py +406 -0
- dolphin/lib/skillkits/resource_skillkit.py +11 -0
- dolphin/lib/skillkits/search_skillkit.py +163 -0
- dolphin/lib/skillkits/sql_skillkit.py +274 -0
- dolphin/lib/skillkits/system_skillkit.py +509 -0
- dolphin/lib/skillkits/vm_skillkit.py +65 -0
- dolphin/lib/utils/__init__.py +9 -0
- dolphin/lib/utils/data_process.py +207 -0
- dolphin/lib/utils/handle_progress.py +178 -0
- dolphin/lib/utils/security.py +139 -0
- dolphin/lib/utils/text_retrieval.py +462 -0
- dolphin/lib/vm/__init__.py +11 -0
- dolphin/lib/vm/env_executor.py +895 -0
- dolphin/lib/vm/python_session_manager.py +453 -0
- dolphin/lib/vm/vm.py +610 -0
- dolphin/sdk/__init__.py +60 -0
- dolphin/sdk/agent/__init__.py +12 -0
- dolphin/sdk/agent/agent_factory.py +236 -0
- dolphin/sdk/agent/dolphin_agent.py +1106 -0
- dolphin/sdk/api/__init__.py +4 -0
- dolphin/sdk/runtime/__init__.py +8 -0
- dolphin/sdk/runtime/env.py +363 -0
- dolphin/sdk/skill/__init__.py +10 -0
- dolphin/sdk/skill/global_skills.py +706 -0
- dolphin/sdk/skill/traditional_toolkit.py +260 -0
- kweaver_dolphin-0.1.0.dist-info/METADATA +521 -0
- kweaver_dolphin-0.1.0.dist-info/RECORD +199 -0
- kweaver_dolphin-0.1.0.dist-info/WHEEL +5 -0
- kweaver_dolphin-0.1.0.dist-info/entry_points.txt +27 -0
- kweaver_dolphin-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
- kweaver_dolphin-0.1.0.dist-info/top_level.txt +2 -0
dolphin/lib/utils/text_retrieval.py
@@ -0,0 +1,462 @@
"""Text Retrieval Utilities

Shared text retrieval related tools, including:
- Tokenizer
- BM25Index - based on rank_bm25
- VectorUtils

Shared for use by modules such as memory_skillkit and local_retrieval_skillkit
"""

import os
import re
from typing import Dict, List, Optional, Tuple
import math
from collections import Counter

# Allow forcing the pure-Python BM25 via an environment variable, to avoid C-extension issues in specific environments
_FORCE_PURE_PY = os.getenv("FORCE_PURE_PY_BM25", "").strip() == "1"
try:
    if _FORCE_PURE_PY:
        raise ImportError("FORCE_PURE_PY_BM25=1")
    from rank_bm25 import BM25Okapi

    _HAS_RANK_BM25 = True
except ImportError:
    _HAS_RANK_BM25 = False
    BM25Okapi = None
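Because the fallback switch is read at import time, FORCE_PURE_PY_BM25 must be set before this module is first imported. A minimal sketch (the environment variable name comes from the code above; the import path follows the module's location in this package):

    import os
    os.environ["FORCE_PURE_PY_BM25"] = "1"  # must happen before the module import below

    from dolphin.lib.utils import text_retrieval  # module now selects the pure-Python path

Note, however, that the BM25Index constructor below still raises when rank_bm25 is absent, so this flag mainly affects which branch the module-level import takes.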

# -----------------------------
# Tokenizer Tools
# -----------------------------

_WORD_RE = re.compile(r"[A-Za-z0-9_]+", re.UNICODE)


def is_cjk(ch: str) -> bool:
    """Detect whether a character is a CJK character (Chinese, Japanese, or Korean)"""
    code = ord(ch)
    return (
        0x4E00 <= code <= 0x9FFF  # CJK Unified Ideographs
        or 0x3400 <= code <= 0x4DBF  # CJK Extension A
        or 0x20000 <= code <= 0x2A6DF  # CJK Extension B
        or 0x2A700 <= code <= 0x2B73F  # CJK Extension C
        or 0x2B740 <= code <= 0x2B81F  # CJK Extension D
        or 0x2B820 <= code <= 0x2CEAF  # CJK Extension E
        or 0xF900 <= code <= 0xFAFF  # CJK Compatibility Ideographs
        or 0x2F800 <= code <= 0x2FA1F  # CJK Compatibility Ideographs Supplement
        or 0xAC00 <= code <= 0xD7AF  # Hangul Syllables (Korean)
        or 0x1100 <= code <= 0x11FF  # Hangul Jamo
        or 0x3130 <= code <= 0x318F  # Hangul Compatibility Jamo
        or 0xA960 <= code <= 0xA97F  # Hangul Jamo Extended-A
        or 0xD7B0 <= code <= 0xD7FF  # Hangul Jamo Extended-B
    )


def tokenize_simple(text: str) -> List[str]:
    """Simple tokenizer: supports ASCII words and CJK characters

    - ASCII words are split on non-word characters and lowercased
    - CJK characters are treated as individual tokens (unigrams)
    """
    if not text:
        return []

    tokens: List[str] = []
    buf: List[str] = []

    for ch in text:
        if is_cjk(ch):
            if buf:
                word = "".join(buf).lower()
                if word:
                    tokens.append(word)
                buf.clear()
            tokens.append(ch)
        elif ch.isalnum() or ch == "_":
            buf.append(ch)
        else:
            if buf:
                word = "".join(buf).lower()
                if word:
                    tokens.append(word)
                buf.clear()

    if buf:
        word = "".join(buf).lower()
        if word:
            tokens.append(word)

    return tokens


def tokenize_bigram_cjk(text: str) -> List[str]:
    """Bigram CJK tokenizer: segments Chinese text into overlapping two-character tokens"""
    text = text.lower()
    # Keep Chinese, English, and digits; convert all other characters to spaces
    text = re.sub(r"[^\w\u4e00-\u9fff]+", " ", text)

    tokens = []
    for chunk in text.split():
        # Check whether the chunk contains Chinese
        if re.search(r"[\u4e00-\u9fff]", chunk):
            # Chinese text is segmented into bigrams (two characters each)
            if len(chunk) >= 2:
                tokens.extend([chunk[i : i + 2] for i in range(len(chunk) - 1)])
            elif len(chunk) == 1:
                tokens.append(chunk)
        else:
            # English/number chunks are kept as whole words
            if len(chunk) > 1:
                tokens.append(chunk)

    return tokens
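To make the two tokenizers concrete, a small illustrative session (expected outputs are derived from the code above, not from the package's tests):

    tokenize_simple("Hello 世界 world_1")
    # -> ["hello", "世", "界", "world_1"]   (ASCII words lowered, CJK as unigrams)

    tokenize_bigram_cjk("机器学习 BM25")
    # -> ["机器", "器学", "学习", "bm25"]   (Chinese bigrams; ASCII chunks kept whole)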

# -----------------------------
# BM25 Index
# -----------------------------


class BM25Index:
    """BM25 index implementation, based on the rank_bm25 library"""

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        if not _HAS_RANK_BM25:
            raise ImportError(
                "rank_bm25 is required. Install with: pip install rank-bm25"
            )

        self.k1 = k1
        self.b = b
        self._bm25_index = None  # rank_bm25 instance (if available)
        self._doc_ids: List[int] = []
        self._tokenizer_func = tokenize_simple

        # Structures required by the pure-Python fallback implementation
        self._tokenized_docs: List[List[str]] = []
        self._tf_docs: List[Counter] = []
        self._idf: Dict[str, float] = {}
        self._doc_lens: List[int] = []
        self._avgdl: float = 0.0

    def build_from_corpus(self, documents: Dict[int, str], tokenizer_func=None) -> None:
        """Build the index from a document corpus"""
        if tokenizer_func is None:
            tokenizer_func = tokenize_simple

        self._tokenizer_func = tokenizer_func
        self._doc_ids = list(documents.keys())

        # Handle an empty document collection
        if not documents:
            self._bm25_index = None
            self._tokenized_docs = []
            self._tf_docs = []
            self._idf = {}
            self._doc_lens = []
            self._avgdl = 0.0
            return

        # Tokenize the documents
        tokenized_docs = []
        for doc_id in self._doc_ids:
            tokens = tokenizer_func(documents[doc_id])
            tokenized_docs.append(tokens)

        if _HAS_RANK_BM25:
            # Build the BM25 index (C extension / third-party library)
            self._bm25_index = BM25Okapi(tokenized_docs, k1=self.k1, b=self.b)
            # Clear the fallback structures
            self._tokenized_docs = []
            self._tf_docs = []
            self._idf = {}
            self._doc_lens = []
            self._avgdl = 0.0
        else:
            # Pure-Python fallback implementation
            self._bm25_index = None
            self._tokenized_docs = tokenized_docs
            self._tf_docs = [Counter(doc) for doc in tokenized_docs]
            self._doc_lens = [len(doc) for doc in tokenized_docs]
            self._avgdl = (
                sum(self._doc_lens) / len(self._doc_lens) if self._doc_lens else 0.0
            )

            # Compute IDF
            N = len(tokenized_docs)
            df: Counter = Counter()
            for doc in tokenized_docs:
                df.update(set(doc))
            self._idf = {}
            for term, dfi in df.items():
                # Smoothed IDF, consistent with common implementations
                self._idf[term] = math.log((N - dfi + 0.5) / (dfi + 0.5) + 1)

    def search(
        self,
        query: str,
        allowed_doc_ids: Optional[set] = None,
        topk: int = 10,
        tokenizer_func=None,
    ) -> List[Tuple[int, float]]:
        """Search for related documents"""
        # If there is neither a rank_bm25 instance nor fallback index data, return empty
        if self._bm25_index is None and not self._tf_docs:
            return []

        if tokenizer_func is None:
            tokenizer_func = self._tokenizer_func

        query_tokens = tokenizer_func(query)
        if not query_tokens:
            return []

        if _HAS_RANK_BM25 and self._bm25_index is not None:
            scores = self._bm25_index.get_scores(query_tokens)
        else:
            # Pure-Python BM25 scoring
            scores = [0.0] * len(self._doc_ids)
            for i, tf in enumerate(self._tf_docs):
                dl = self._doc_lens[i] if self._doc_lens else 0
                for term in query_tokens:
                    if term not in self._idf:
                        continue
                    tf_i = tf.get(term, 0)
                    if tf_i == 0:
                        continue
                    idf = self._idf[term]
                    denom = tf_i + self.k1 * (
                        1
                        - self.b
                        + self.b * (dl / self._avgdl if self._avgdl > 0 else 0)
                    )
                    score = idf * (tf_i * (self.k1 + 1)) / denom
                    scores[i] += score

        # Pair doc_ids with scores, keeping only relevant documents:
        # a BM25 score of 0 means the document contains no query term and can be filtered out
        results = []
        for i, score in enumerate(scores):
            if score != 0.0:  # Filter out irrelevant documents
                doc_id = self._doc_ids[i]
                if allowed_doc_ids is None or doc_id in allowed_doc_ids:
                    results.append((doc_id, float(score)))

        # Sort and return the topk results
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:topk]
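For reference, the pure-Python branch above implements the standard Okapi BM25 weighting, with the IDF smoothed (the "+ 1" inside the log) so it never goes negative. In LaTeX:

    \mathrm{score}(q, d) = \sum_{t \in q} \mathrm{idf}(t)\,
        \frac{f(t, d)\,(k_1 + 1)}{f(t, d) + k_1\bigl(1 - b + b\,\tfrac{|d|}{\mathrm{avgdl}}\bigr)},
    \qquad
    \mathrm{idf}(t) = \ln\!\Bigl(\frac{N - n_t + 0.5}{n_t + 0.5} + 1\Bigr)

where f(t, d) is the frequency of term t in document d, |d| is the token length of d, avgdl the average document length, N the corpus size, and n_t the number of documents containing t.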

    def add_or_update(self, doc_id: int, text: str, tokenizer_func=None) -> None:
        """Add or update a single document (requires rebuilding the entire index)"""
        # Dynamic updates require rebuilding the index.
        # This is a limitation of rank_bm25, but acceptable for most scenarios.

        # If the index does not exist yet, create a new one.
        if self._bm25_index is None:
            documents = {doc_id: text}
            self.build_from_corpus(documents, tokenizer_func)
            return

        # Otherwise the index would need to be rebuilt (incremental updates are possible but would add complexity)
        raise NotImplementedError(
            "Dynamic update requires rebuilding the index. Use build_from_corpus instead."
        )

    def remove(self, doc_id: int) -> None:
        """Delete a document (requires rebuilding the entire index)"""
        raise NotImplementedError(
            "Document removal requires rebuilding the index. Use build_from_corpus instead."
        )

    @property
    def N(self) -> int:
        """Total number of documents"""
        return len(self._doc_ids) if self._doc_ids else 0

    # Backward-compatible alias
    def search_optimized(
        self,
        query: str,
        allowed_doc_ids: Optional[set] = None,
        topk: int = 10,
        tokenizer_func=None,
    ) -> List[Tuple[int, float]]:
        """Alias for optimized search"""
        return self.search(query, allowed_doc_ids, topk, tokenizer_func)
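A minimal usage sketch for the class above (document IDs and texts are illustrative; rank-bm25 must be installed, since as published __init__ raises whenever the library is absent, which also leaves the pure-Python fallback paths effectively unreachable):

    docs = {
        1: "BM25 is a ranking function used by search engines",
        2: "向量检索 uses embeddings for semantic search",
        3: "a tokenizer splits text into tokens",
    }
    index = BM25Index(k1=1.5, b=0.75)
    index.build_from_corpus(docs, tokenizer_func=tokenize_simple)

    index.search("ranking function", topk=2)
    # -> [(1, <score>)]  ; documents sharing no query term are filtered out
    index.N  # -> 3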

# -----------------------------
# Vector Calculation Tools
# -----------------------------


class VectorUtils:
    """Vector calculation utility class"""

    @staticmethod
    def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity"""
        if len(vec1) != len(vec2):
            return 0.0

        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        norm1 = sum(a * a for a in vec1) ** 0.5
        norm2 = sum(b * b for b in vec2) ** 0.5

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)

    @staticmethod
    def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
        """Calculate Euclidean distance"""
        if len(vec1) != len(vec2):
            return float("inf")

        return sum((a - b) ** 2 for a, b in zip(vec1, vec2)) ** 0.5

    @staticmethod
    def normalize_l2(vec: List[float]) -> List[float]:
        """L2 normalization"""
        norm = sum(v * v for v in vec) ** 0.5
        if norm > 0:
            return [v / norm for v in vec]
        return vec.copy()

    @staticmethod
    def compute_simple_embedding(
        text: str, dim: int = 384, tokenizer_func=None
    ) -> List[float]:
        """Simple hash-based embedding implementation (fallback solution)"""
        import hashlib

        if tokenizer_func is None:
            tokenizer_func = tokenize_simple

        vec = [0.0] * dim
        tokens = tokenizer_func(text) or [""]

        for tok in tokens:
            h = hashlib.md5(tok.encode("utf-8")).digest()
            # Derive two indices and two weights from the 16-byte digest
            idx1 = int.from_bytes(h[:4], "big") % dim
            idx2 = int.from_bytes(h[4:8], "big") % dim
            w1 = (int.from_bytes(h[8:12], "big") % 100) / 100.0 + 1.0
            w2 = (int.from_bytes(h[12:16], "big") % 100) / 100.0 + 0.5
            vec[idx1] += w1
            vec[idx2] += w2

        return VectorUtils.normalize_l2(vec)
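Illustrative calls to the helpers above (values follow directly from the code):

    a = VectorUtils.normalize_l2([3.0, 4.0])                 # [0.6, 0.8]
    VectorUtils.cosine_similarity(a, [0.6, 0.8])             # 1.0
    VectorUtils.euclidean_distance([0.0, 0.0], [3.0, 4.0])   # 5.0

    # The hash-based fallback embedding is deterministic, so identical
    # texts map to identical (unit-normalized) vectors
    e1 = VectorUtils.compute_simple_embedding("hello world", dim=64)
    e2 = VectorUtils.compute_simple_embedding("hello world", dim=64)
    VectorUtils.cosine_similarity(e1, e2)                    # 1.0 (up to float precision)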

# -----------------------------
# Hybrid Retrieval Tool
# -----------------------------


class HybridRetriever:
    """Hybrid retriever: combines BM25 and vector similarity"""

    def __init__(self, bm25_weight: float = 0.7):
        self.bm25_weight = bm25_weight
        self.embedding_weight = 1.0 - bm25_weight

    def combine_scores(
        self,
        bm25_results: List[Tuple[int, float]],
        embedding_results: List[Tuple[int, float]],
    ) -> List[Tuple[int, float]]:
        """Combine BM25 and embedding scores"""
        bm25_dict = dict(bm25_results)
        embedding_dict = dict(embedding_results)

        # Collect all document IDs
        all_doc_ids = set(bm25_dict.keys()) | set(embedding_dict.keys())

        if not all_doc_ids:
            return []

        # Score normalization
        def minmax_normalize(scores: List[float]) -> List[float]:
            if not scores:
                return scores
            min_s, max_s = min(scores), max(scores)
            if max_s == min_s:
                return [1.0] * len(scores)
            return [(s - min_s) / (max_s - min_s) for s in scores]

        bm25_scores = list(bm25_dict.values())
        embedding_scores = list(embedding_dict.values())

        # Normalize only when scores are present
        bm25_normalized = {}
        if bm25_scores:
            bm25_norm_scores = minmax_normalize(bm25_scores)
            bm25_normalized = dict(zip(bm25_dict.keys(), bm25_norm_scores))

        embedding_normalized = {}
        if embedding_scores:
            emb_norm_scores = minmax_normalize(embedding_scores)
            embedding_normalized = dict(zip(embedding_dict.keys(), emb_norm_scores))

        # Combine the scores
        combined_results = []
        for doc_id in all_doc_ids:
            bm25_score = bm25_normalized.get(doc_id, 0.0)
            emb_score = embedding_normalized.get(doc_id, 0.0)
            combined_score = (
                self.bm25_weight * bm25_score + self.embedding_weight * emb_score
            )
            combined_results.append((doc_id, combined_score))

        return sorted(combined_results, key=lambda x: x[1], reverse=True)
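A quick fusion sketch for the class above (the input scores are illustrative):

    retriever = HybridRetriever(bm25_weight=0.7)
    retriever.combine_scores(
        bm25_results=[(1, 2.4), (2, 1.6), (3, 0.8)],
        embedding_results=[(2, 0.9), (3, 0.6)],
    )
    # Each list is min-max normalized independently, then fused as
    # 0.7 * bm25 + 0.3 * embedding:
    # -> [(1, 0.7), (2, 0.65), (3, 0.0)]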

# -----------------------------
# Convenience functions
# -----------------------------


def create_bm25_index(
    documents: Dict[int, str],
    tokenizer: str = "simple",
    k1: float = 1.5,
    b: float = 0.75,
) -> BM25Index:
    """Convenience function: create a BM25 index

    Args:
        documents: Document dictionary {doc_id: content}
        tokenizer: Tokenizer type ("simple", "bigram_cjk")
        k1, b: BM25 parameters
    """
    tokenizer_func = {
        "simple": tokenize_simple,
        "bigram_cjk": tokenize_bigram_cjk,
    }.get(tokenizer, tokenize_simple)

    index = BM25Index(k1=k1, b=b)
    index.build_from_corpus(documents, tokenizer_func)
    return index


def search_documents(
    index: BM25Index, query: str, topk: int = 10, tokenizer: str = "simple"
) -> List[Tuple[int, float]]:
    """Convenience function: search documents

    Args:
        index: BM25 index
        query: Query string
        topk: Number of results to return
        tokenizer: Tokenizer type
    """
    tokenizer_func = {
        "simple": tokenize_simple,
        "bigram_cjk": tokenize_bigram_cjk,
    }.get(tokenizer, tokenize_simple)

    return index.search(query, topk=topk, tokenizer_func=tokenizer_func)
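Putting the convenience functions together, an end-to-end sketch (corpus contents are illustrative):

    corpus = {
        101: "深度学习模型的训练与评估",
        102: "BM25 ranking for full-text search",
        103: "混合检索结合关键词与向量",
    }
    idx = create_bm25_index(corpus, tokenizer="bigram_cjk")
    for doc_id, score in search_documents(idx, "向量检索", topk=3, tokenizer="bigram_cjk"):
        print(doc_id, round(score, 3))  # doc 103 matches on the 检索 and 向量 bigrams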