cnhkmcp 2.0.4__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/README.md +38 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/config.json +6 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/ace_lib.py +1510 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_datasets.py +157 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_documentation.py +132 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_operators.py +99 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/helpful_functions.py +180 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.ico +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.png +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/knowledge/test.txt +1 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/main.py +576 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/process_knowledge_base.py +280 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/rag_engine.py +356 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/requirements.txt +7 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/run.bat +3 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/_manifest.json +326 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/_meta.json +1 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/be5d957c-b724-46e3-91d1-999e9f5f7d28/index_metadata.pickle +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/chroma.sqlite3 +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242//321/211/320/266/320/246/321/206/320/274/320/261/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +265 -0
- cnhkmcp/untracked/APP/Tranformer/Transformer.py +2804 -11
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +1524 -889
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +884 -111
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +442 -168
- cnhkmcp/untracked/APP/Tranformer/template_summary.txt +2775 -1
- cnhkmcp/untracked/APP/ace.log +2 -0
- cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +157 -0
- cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +99 -0
- cnhkmcp/untracked/APP/simulator/simulator_wqb.py +16 -16
- cnhkmcp/untracked/APP/static/brain.js +61 -0
- cnhkmcp/untracked/APP/static/script.js +140 -0
- cnhkmcp/untracked/APP/templates/index.html +25 -4
- cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +67 -6
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/METADATA +1 -1
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/RECORD +40 -20
- cnhkmcp/untracked/APP/hkSimulator/autosim_20251205_145240.log +0 -0
- cnhkmcp/untracked/APP/hkSimulator/autosim_20251215_030103.log +0 -0
- cnhkmcp/untracked/{APP/hkSimulator/ace.log → AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/ace.log} +0 -0
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,280 @@ process_knowledge_base.py (new file)
import os
import sys
import pandas as pd
import json
import re
from pathlib import Path
from typing import List

# Add get_knowledgeBase_tool to path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
TOOL_DIR = os.path.join(SCRIPT_DIR, "get_knowledgeBase_tool")
if TOOL_DIR not in sys.path:
    sys.path.insert(0, TOOL_DIR)

# Import from tool directory
sys.path.insert(0, TOOL_DIR)
import ace_lib
from fetch_all_operators import fetch_operators, prompt_credentials
from fetch_all_datasets import (
    fetch_all_combinations,
    fetch_datasets_for_combo,
    merge_and_deduplicate,
)
from fetch_all_documentation import (
    fetch_tutorials,
    fetch_tutorial_pages,
    fetch_page,
    _extract_page_id,
)


def ensure_knowledge_dir():
    """Ensure knowledge directory exists"""
    knowledge_dir = os.path.join(SCRIPT_DIR, "knowledge")
    os.makedirs(knowledge_dir, exist_ok=True)
    return knowledge_dir


def to_jsonable(value):
    """Convert values to JSON-serializable, handling NaN and nested structures."""
    try:
        if isinstance(value, float) and pd.isna(value):
            return None
    except TypeError:
        pass

    if isinstance(value, list):
        return [to_jsonable(v) for v in value if not (isinstance(v, float) and pd.isna(v))]
    if isinstance(value, dict):
        return {k: to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (str, int, float, bool)) or value is None:
        return value
    return str(value)


def safe_filename(name: str, suffix: str = "") -> str:
    base = re.sub(r"[^A-Za-z0-9._-]+", "_", str(name)).strip("_") or "doc"
    base = base[:80]
    return f"{base}{suffix}"


def process_operators(session: ace_lib.SingleSession, knowledge_dir: str):
    """
    Process operators and save as JSON files

    Args:
        session: Authenticated BRAIN session
        knowledge_dir: Directory to save JSON files
    """
    print("\n=== Processing Operators ===")

    # Fetch operators data
    print("Fetching operators...")
    operators_df = fetch_operators(session)

    if operators_df.empty:
        print("No operators found!")
        return

    print(f"Found {len(operators_df)} operator entries")

    # Get unique categories
    categories = sorted(operators_df['category'].dropna().unique())

    for category in categories:
        category_data = operators_df[operators_df['category'] == category].copy()

        # Create JSON file for this category
        filename = f"{category.replace(' ', '_').lower()}_operators.json"
        filepath = os.path.join(knowledge_dir, filename)

        print(f"Processing category: {category}")

        # Convert to list of dicts
        category_list = []
        for idx, row in category_data.iterrows():
            operator_dict = {}
            for col in row.index:
                value = row[col]
                operator_dict[col] = to_jsonable(value)
            category_list.append(operator_dict)

        # Save category JSON
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(category_list, f, ensure_ascii=False, indent=2)

        print(f"✓ Created {filename} with {len(category_list)} operators")


def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
    """Fetch datasets and save one JSON per region."""
    print("=== Processing Datasets ===")

    print("Fetching valid instrument/region/delay/universe combinations...")
    options_df = fetch_all_combinations(session)
    if options_df is None or options_df.empty:
        print("No simulation options fetched; aborting dataset fetch.")
        return

    all_datasets: list[pd.DataFrame] = []
    combo_idx = 0

    for _, row in options_df.iterrows():
        instrument_type = row.get("InstrumentType")
        region = row.get("Region")
        delay = row.get("Delay")
        universes = row.get("Universe") or []

        for universe in universes:
            combo_idx += 1
            print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
            try:
                df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
                print(f" -> {len(df)} rows")
                all_datasets.append(df)
            except Exception as exc:
                print(f" -> Failed: {exc}")

    if not all_datasets:
        print("No datasets fetched; nothing to save.")
        return

    combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
    if combined_df.empty:
        print("No datasets fetched; nothing to save.")
        return

    regions = sorted(combined_df["param_region"].dropna().unique())
    print(f"Found regions: {', '.join(regions)}")

    for region in regions:
        region_df = combined_df[combined_df["param_region"] == region]
        region_unique = merge_and_deduplicate([region_df])

        region_list = []
        for _, row in region_unique.iterrows():
            record = {col: to_jsonable(row[col]) for col in row.index}
            region_list.append(record)

        filename = f"{region.replace(' ', '_').lower()}_datasets.json"
        filepath = os.path.join(dataset_dir, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(region_list, f, ensure_ascii=False, indent=2)

        print(f"✓ Created {filename} with {len(region_list)} datasets")


def process_documentation(session: ace_lib.SingleSession, knowledge_dir: str):
    """Fetch tutorials and pages, save one JSON per page."""
    print("=== Processing Documentation ===")

    tutorials = fetch_tutorials(session)
    if not tutorials:
        print("No tutorials fetched; skipping documentation.")
        return

    print(f"Fetched {len(tutorials)} tutorials")

    page_count = 0
    seen_pages = set()

    for idx, tutorial in enumerate(tutorials, start=1):
        tutorial_id = _extract_page_id(tutorial) or f"tutorial_{idx}"
        tutorial_title = tutorial.get("title") or tutorial_id

        page_candidates = []
        if isinstance(tutorial.get("pages"), list):
            page_candidates.extend(tutorial["pages"])
        if tutorial_id:
            try:
                page_candidates.extend(fetch_tutorial_pages(session, tutorial_id))
            except Exception as exc:
                print(f"[{idx:03d}] failed to fetch pages for {tutorial_id}: {exc}")

        if not page_candidates and tutorial_id:
            page_candidates.append({"id": tutorial_id, "title": tutorial_title})

        for page_entry in page_candidates:
            page_id = _extract_page_id(page_entry)
            if not page_id or page_id in seen_pages:
                continue
            seen_pages.add(page_id)

            try:
                page = fetch_page(session, page_id)
            except Exception as exc:
                print(f"[{idx:03d}] page {page_id} failed: {exc}")
                continue

            page_count += 1
            page_title = page.get("title") or page_entry.get("title") or page_id

            # Save each page as individual JSON
            filename = safe_filename(f"{idx:03d}_{page_title}", "_documentation.json")
            filepath = os.path.join(knowledge_dir, filename)

            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(to_jsonable(page), f, ensure_ascii=False, indent=2)

            print(f"[{idx:03d}] ✓ Created {filename}")

    print(f"✓ Total: {page_count} documentation pages saved")


def main():
    print("=== BRAIN Knowledge Base Processor ===")
    print("Starting operator processing...\n")

    # Get credentials
    email, password = prompt_credentials()
    ace_lib.get_credentials = lambda: (email, password)

    print("Logging in to BRAIN platform...")
    try:
        session = ace_lib.start_session()
        print("✓ Login successful\n")
    except Exception as exc:
        print(f"✗ Login failed: {exc}")
        return

    # Ensure knowledge directory exists
    knowledge_dir = ensure_knowledge_dir()
    dataset_dir = knowledge_dir  # Save datasets directly under knowledge
    print(f"Knowledge directory: {knowledge_dir}\n")

    # Process documentation (tutorials/pages)
    print("\nStarting documentation processing...\n")
    try:
        process_documentation(session, knowledge_dir)
    except Exception as exc:
        print(f"✗ Failed to process documentation: {exc}")
        import traceback
        traceback.print_exc()
        return

    # Process operators
    try:
        process_operators(session, knowledge_dir)
    except Exception as exc:
        print(f"✗ Failed to process operators: {exc}")
        import traceback
        traceback.print_exc()
        return

    # Process datasets by region
    print("\nStarting dataset processing...\n")
    try:
        process_datasets(session, dataset_dir)
    except Exception as exc:
        print(f"✗ Failed to process datasets: {exc}")
        import traceback
        traceback.print_exc()
        return

    print("\n=== Processing Complete ===")


if __name__ == "__main__":
    main()
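As a quick illustration of the to_jsonable helper introduced above, here is a minimal sketch of its behaviour on hypothetical values (the inputs are illustrative, not taken from the package):

    import math
    import pandas as pd

    # NaN floats collapse to None, NaN list entries are dropped,
    # and non-JSON-native objects fall back to str()
    to_jsonable({"alpha": math.nan, "ids": [1, math.nan, 3], "ts": pd.Timestamp("2024-01-01")})
    # -> {'alpha': None, 'ids': [1, 3], 'ts': '2024-01-01 00:00:00'}
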
@@ -0,0 +1,356 @@ rag_engine.py (new file)
import os
import json
import shutil
import chromadb
from fastembed import TextEmbedding
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import threading


PREFERRED_MODELS = [
    "jinaai/jina-embeddings-v2-base-zh",  # good for mixed Chinese/English text, ~0.64GB
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",  # multilingual, ~50 languages
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # lightweight multilingual variant
    "intfloat/multilingual-e5-large",  # stronger multilingual model, ~2.2GB on disk
]

# Final chosen model will be detected at runtime from supported list
MODEL_NAME = None
COLLECTION_NAME = "brain_kb_v5"

# Optional imports for different file types
try:
    from pypdf import PdfReader
except ImportError:
    PdfReader = None

try:
    from docx import Document
except ImportError:
    Document = None

class KnowledgeBase:
    def __init__(self, kb_path="knowledge", db_path="vector_db"):
        self.kb_path = os.path.abspath(kb_path)
        self.db_path = os.path.abspath(db_path)
        self.meta_path = os.path.join(self.db_path, "_meta.json")
        self.manifest_path = os.path.join(self.db_path, "_manifest.json")
        self._collection_reset_guard = False

        if not os.path.exists(self.kb_path):
            os.makedirs(self.kb_path)

        # Initialize Embedding Model (BAAI/bge-small-zh-v1.5 is ~100MB)
        # This will load from cache if already downloaded
        # Pick the first available model from the preferred list
        _supported_raw = TextEmbedding.list_supported_models()
        supported = set()
        for item in _supported_raw:
            if isinstance(item, dict) and "model" in item:
                supported.add(item["model"])
            elif isinstance(item, str):
                supported.add(item)
        chosen = None
        for name in PREFERRED_MODELS:
            if name in supported:
                chosen = name
                break
        if not chosen:
            raise RuntimeError(
                "No preferred embedding models are supported by fastembed. "
                "Please check available models via TextEmbedding.list_supported_models()."
            )

        print(f"Loading Knowledge Base Embedding Model: {chosen} (may take some time on first run)...")
        try:
            self.model = TextEmbedding(model_name=chosen)
            print("Embedding Model loaded successfully.")
        except Exception as e:
            print(f"Error loading embedding model: {e}")
            raise

        # Store chosen model name for reference
        global MODEL_NAME
        MODEL_NAME = chosen

        # Cache embedding dimension (detects library/model changes that corrupt existing indexes)
        self.embed_dim = self._get_embedding_dim()
        self.chroma_version = getattr(chromadb, "__version__", "unknown")

        # If the stored index was built with a different model/dimension/chromadb version, wipe it
        self._maybe_reset_for_incompatibility(chosen, self.embed_dim, self.chroma_version)

        # Initialize Vector DB
        self._init_collection()

        # Initial sync
        self.sync_knowledge()

        # Start Watcher
        self.start_watcher()

    def _init_collection(self, recreate: bool = False):
        """(Re)initialize Chroma client/collection. If recreate=True, wipe on-disk index."""
        if recreate and os.path.exists(self.db_path):
            shutil.rmtree(self.db_path, ignore_errors=True)
        try:
            self.client = chromadb.PersistentClient(path=self.db_path)
            self.collection = self.client.get_or_create_collection(
                name=COLLECTION_NAME,
                metadata={"hnsw:space": "cosine"}
            )
        except Exception as exc:
            # If collection load itself fails, wipe and retry once to clear corrupted segments
            if not recreate:
                shutil.rmtree(self.db_path, ignore_errors=True)
                return self._init_collection(recreate=True)
            raise

        # Persist metadata about the embedding model used to build this index
        try:
            os.makedirs(self.db_path, exist_ok=True)
            with open(self.meta_path, "w", encoding="utf-8") as f:
                json.dump({
                    "model": MODEL_NAME,
                    "embed_dim": self.embed_dim,
                    "chroma_version": self.chroma_version,
                }, f)
        except Exception:
            pass  # Metadata failure should not block runtime

    def _maybe_reset_for_incompatibility(self, chosen_model: str, embed_dim: int, chroma_version: str):
        """If existing index meta differs (model/dimension/chromadb), wipe it."""
        if not os.path.exists(self.db_path):
            return
        try:
            with open(self.meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
            prev_model = meta.get("model")
            prev_dim = meta.get("embed_dim")
            prev_chroma = meta.get("chroma_version")
            if prev_model != chosen_model or prev_dim != embed_dim or prev_chroma != chroma_version:
                shutil.rmtree(self.db_path, ignore_errors=True)
        except Exception:
            # If meta cannot be read, assume stale/corrupted and rebuild
            shutil.rmtree(self.db_path, ignore_errors=True)

    def _get_embedding_dim(self) -> int:
        for vec in self.model.embed(["dimension_probe"]):
            try:
                return len(vec)
            except Exception:
                return len(list(vec))
        raise RuntimeError("Failed to determine embedding dimension")

    def sync_knowledge(self, allow_reset: bool = True):
        """Scans the knowledge folder and updates the vector database."""
        print("Syncing knowledge base...")
        manifest = self._load_manifest()
        updated_manifest = {}
        supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
        current_files = []
        for filename in os.listdir(self.kb_path):
            file_path = os.path.join(self.kb_path, filename)
            if os.path.isfile(file_path) and filename.lower().endswith(supported_extensions):
                current_files.append(filename)
                mtime = os.path.getmtime(file_path)
                size = os.path.getsize(file_path)
                prev_meta = manifest.get(filename)
                # Skip unchanged files
                if prev_meta and prev_meta.get("mtime") == mtime and prev_meta.get("size") == size:
                    updated_manifest[filename] = prev_meta
                    continue
                try:
                    content = self._extract_text(file_path)
                    if content:
                        # Sliding window chunking on original text
                        chunk_size = 400
                        overlap = 80
                        original_chunks = []
                        for i in range(0, len(content), chunk_size - overlap):
                            chunk = content[i:i + chunk_size].strip()
                            if chunk:
                                original_chunks.append(chunk)

                        if original_chunks:
                            # Normalize for embedding generation only (not for storage)
                            normalized_chunks = [c.lower().replace('_', ' ') for c in original_chunks]

                            ids = [f"{filename}_{i}" for i in range(len(original_chunks))]
                            metadatas = [{"source": filename, "chunk": i} for i in range(len(original_chunks))]

                            # Compute embeddings from normalized text
                            embeddings = []
                            for v in self.model.embed(normalized_chunks):
                                try:
                                    embeddings.append(v.tolist())
                                except Exception:
                                    embeddings.append(list(v))

                            # Store ORIGINAL text (not normalized) so users see the real content
                            self.collection.upsert(
                                documents=original_chunks,
                                ids=ids,
                                metadatas=metadatas,
                                embeddings=embeddings
                            )
                            print(f" ✓ Indexed {filename}: {len(original_chunks)} chunks")
                    updated_manifest[filename] = {"mtime": mtime, "size": size}
                except Exception as e:
                    err_msg = str(e)
                    print(f"Error processing {filename}: {err_msg}")
                    # Auto-recover if HNSW/compaction/index errors occur
                    if allow_reset and any(x in err_msg.lower() for x in ["hnsw", "compaction", "segment reader"]):
                        if not self._collection_reset_guard:
                            print("Detected index corruption. Rebuilding vector_db and retrying sync once...")
                            self._collection_reset_guard = True
                            self._init_collection(recreate=True)
                            return self.sync_knowledge(allow_reset=False)
        # Remove deleted files from the index
        deleted_files = set(manifest.keys()) - set(current_files)
        for filename in deleted_files:
            try:
                self.collection.delete(where={"source": filename})
                print(f" ✓ Removed deleted file from index: {filename}")
            except Exception as e:
                print(f" ! Failed to remove {filename}: {e}")
        # Persist manifest
        self._save_manifest(updated_manifest)
        print("Knowledge base sync complete.")

    def _extract_text(self, file_path):
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".txt":
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif ext == ".md":
            # Treat Markdown as plain text for retrieval
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        elif ext == ".pdf":
            if PdfReader:
                reader = PdfReader(file_path)
                text = ""
                for page in reader.pages:
                    text += page.extract_text() + "\n"
                return text
            else:
                print("pypdf not installed, skipping PDF.")
        elif ext == ".docx":
            if Document:
                doc = Document(file_path)
                return "\n".join([para.text for para in doc.paragraphs])
            else:
                print("python-docx not installed, skipping Word.")
        elif ext == ".json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return json.dumps(data, ensure_ascii=False, indent=2)
        return None

    def query(self, text, top_k=5, distance_threshold=0.8):
        """Retrieves relevant snippets from the knowledge base.

        Uses cosine distance (lower is better). A result is treated as a hit only
        when best_distance <= distance_threshold.
        Returns:
            dict: {"hit": bool, "context": str, "hits": [{source, chunk, distance, text}, ...]}
        """
        try:
            # Normalize query same as indexed content
            normalized_text = text.lower().replace('_', ' ')

            q_vec = None
            for v in self.model.embed([normalized_text]):
                try:
                    q_vec = v.tolist()
                except Exception:
                    q_vec = list(v)
                break
            if q_vec is None:
                return {"hit": False, "context": "", "hits": []}

            results = self.collection.query(
                query_embeddings=[q_vec],
                n_results=top_k,
                include=["documents", "metadatas", "distances"]
            )

            docs = (results or {}).get("documents") or []
            metas = (results or {}).get("metadatas") or []
            dists = (results or {}).get("distances") or []

            if not docs or not docs[0]:
                print("[KB Query] No results returned from collection")
                return {"hit": False, "context": "", "hits": []}

            docs0 = docs[0]
            metas0 = metas[0] if metas and metas[0] else [{} for _ in docs0]
            dists0 = dists[0] if dists and dists[0] else [None for _ in docs0]

            hits = []
            for doc_text, meta, dist in zip(docs0, metas0, dists0):
                hits.append({
                    "source": (meta or {}).get("source", ""),
                    "chunk": (meta or {}).get("chunk", None),
                    "distance": dist,
                    "text": doc_text,
                })

            best = hits[0].get("distance")
            is_hit = (best is not None) and (best <= distance_threshold)

            # Debug log
            best_str = f"{best:.4f}" if best is not None else "N/A"
            print(f"[KB Query] '{text[:50]}...' -> best_dist={best_str}, threshold={distance_threshold}, hit={is_hit}")
            if hits:
                top3_dists = [f"{h['distance']:.4f}" if h['distance'] is not None else "N/A" for h in hits[:3]]
                print(f"[KB Query] Top 3 distances: {top3_dists}")

            context = "\n---\n".join([h["text"] for h in hits]) if is_hit else ""
            return {"hit": is_hit, "context": context, "hits": hits}
        except Exception as e:
            print(f"Query error: {e}")
            import traceback
            traceback.print_exc()
            return {"hit": False, "context": "", "hits": []}

    def start_watcher(self):
        event_handler = KBHandler(self)
        self.observer = Observer()
        self.observer.schedule(event_handler, self.kb_path, recursive=False)
        self.observer.start()

    def _load_manifest(self):
        if not os.path.exists(self.manifest_path):
            return {}
        try:
            with open(self.manifest_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return {}

    def _save_manifest(self, data):
        try:
            os.makedirs(self.db_path, exist_ok=True)
            with open(self.manifest_path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f" ! Failed to save manifest: {e}")

class KBHandler(FileSystemEventHandler):
    def __init__(self, kb_instance):
        self.kb = kb_instance
        self.supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")

    def on_modified(self, event):
        if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
            print(f"File modified: {event.src_path}. Re-syncing...")
            threading.Thread(target=self.kb.sync_knowledge).start()

    def on_created(self, event):
        if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
            print(f"File created: {event.src_path}. Syncing...")
            threading.Thread(target=self.kb.sync_knowledge).start()
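For context, a minimal usage sketch of the KnowledgeBase class added in rag_engine.py, assuming the knowledge/ and vector_db/ directories shipped alongside it (the import path and query string are illustrative assumptions, not fixed by the package):

    from rag_engine import KnowledgeBase

    # Builds or loads the Chroma index, syncs the knowledge folder, and starts the file watcher
    kb = KnowledgeBase(kb_path="knowledge", db_path="vector_db")
    result = kb.query("How do I fetch BRAIN operators?", top_k=5, distance_threshold=0.8)
    if result["hit"]:
        print(result["context"])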