cnhkmcp-2.0.4-py3-none-any.whl → cnhkmcp-2.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cnhkmcp/untracked/AI/桌面插件/README.md +38 -0
  2. cnhkmcp/untracked/AI/桌面插件/config.json +6 -0
  3. cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/ace_lib.py +1510 -0
  4. cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_datasets.py +157 -0
  5. cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_documentation.py +132 -0
  6. cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_operators.py +99 -0
  7. cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/helpful_functions.py +180 -0
  8. cnhkmcp/untracked/AI/桌面插件/icon.ico +0 -0
  9. cnhkmcp/untracked/AI/桌面插件/icon.png +0 -0
  10. cnhkmcp/untracked/AI/桌面插件/knowledge/test.txt +1 -0
  11. cnhkmcp/untracked/AI/桌面插件/main.py +576 -0
  12. cnhkmcp/untracked/AI/桌面插件/process_knowledge_base.py +280 -0
  13. cnhkmcp/untracked/AI/桌面插件/rag_engine.py +356 -0
  14. cnhkmcp/untracked/AI/桌面插件/requirements.txt +7 -0
  15. cnhkmcp/untracked/AI/桌面插件/run.bat +3 -0
  16. cnhkmcp/untracked/AI/桌面插件/vector_db/_manifest.json +326 -0
  17. cnhkmcp/untracked/AI/桌面插件/vector_db/_meta.json +1 -0
  18. cnhkmcp/untracked/AI/桌面插件/vector_db/be5d957c-b724-46e3-91d1-999e9f5f7d28/index_metadata.pickle +0 -0
  19. cnhkmcp/untracked/AI/桌面插件/vector_db/chroma.sqlite3 +0 -0
  20. cnhkmcp/untracked/AI/桌面插件/首次运行打开我.py +265 -0
  21. cnhkmcp/untracked/APP/Tranformer/Transformer.py +2804 -11
  22. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +1524 -889
  23. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +884 -111
  24. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +442 -168
  25. cnhkmcp/untracked/APP/Tranformer/template_summary.txt +2775 -1
  26. cnhkmcp/untracked/APP/ace.log +2 -0
  27. cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +157 -0
  28. cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +99 -0
  29. cnhkmcp/untracked/APP/simulator/simulator_wqb.py +16 -16
  30. cnhkmcp/untracked/APP/static/brain.js +61 -0
  31. cnhkmcp/untracked/APP/static/script.js +140 -0
  32. cnhkmcp/untracked/APP/templates/index.html +25 -4
  33. cnhkmcp/untracked/APP/运行打开我.py +67 -6
  34. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/METADATA +1 -1
  35. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/RECORD +40 -20
  36. cnhkmcp/untracked/APP/hkSimulator/autosim_20251205_145240.log +0 -0
  37. cnhkmcp/untracked/APP/hkSimulator/autosim_20251215_030103.log +0 -0
  38. cnhkmcp/untracked/{APP/hkSimulator/ace.log → AI/桌面插件/ace.log} +0 -0
  39. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/WHEEL +0 -0
  40. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/entry_points.txt +0 -0
  41. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/licenses/LICENSE +0 -0
  42. {cnhkmcp-2.0.4.dist-info → cnhkmcp-2.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,280 @@
+ import os
+ import sys
+ import pandas as pd
+ import json
+ import re
+ from pathlib import Path
+ from typing import List
+
+ # Add get_knowledgeBase_tool to path
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+ TOOL_DIR = os.path.join(SCRIPT_DIR, "get_knowledgeBase_tool")
+ if TOOL_DIR not in sys.path:
+     sys.path.insert(0, TOOL_DIR)
+
+ # Import from tool directory
+ sys.path.insert(0, TOOL_DIR)
+ import ace_lib
+ from fetch_all_operators import fetch_operators, prompt_credentials
+ from fetch_all_datasets import (
+     fetch_all_combinations,
+     fetch_datasets_for_combo,
+     merge_and_deduplicate,
+ )
+ from fetch_all_documentation import (
+     fetch_tutorials,
+     fetch_tutorial_pages,
+     fetch_page,
+     _extract_page_id,
+ )
+
+
+ def ensure_knowledge_dir():
+     """Ensure knowledge directory exists"""
+     knowledge_dir = os.path.join(SCRIPT_DIR, "knowledge")
+     os.makedirs(knowledge_dir, exist_ok=True)
+     return knowledge_dir
+
+
+ def to_jsonable(value):
+     """Convert values to JSON-serializable, handling NaN and nested structures."""
+     try:
+         if isinstance(value, float) and pd.isna(value):
+             return None
+     except TypeError:
+         pass
+
+     if isinstance(value, list):
+         return [to_jsonable(v) for v in value if not (isinstance(v, float) and pd.isna(v))]
+     if isinstance(value, dict):
+         return {k: to_jsonable(v) for k, v in value.items()}
+     if isinstance(value, (str, int, float, bool)) or value is None:
+         return value
+     return str(value)
+
+
+ def safe_filename(name: str, suffix: str = "") -> str:
+     base = re.sub(r"[^A-Za-z0-9._-]+", "_", str(name)).strip("_") or "doc"
+     base = base[:80]
+     return f"{base}{suffix}"
+
+
+ def process_operators(session: ace_lib.SingleSession, knowledge_dir: str):
+     """
+     Process operators and save as JSON files
+
+     Args:
+         session: Authenticated BRAIN session
+         knowledge_dir: Directory to save JSON files
+     """
+     print("\n=== Processing Operators ===")
+
+     # Fetch operators data
+     print("Fetching operators...")
+     operators_df = fetch_operators(session)
+
+     if operators_df.empty:
+         print("No operators found!")
+         return
+
+     print(f"Found {len(operators_df)} operator entries")
+
+     # Get unique categories
+     categories = sorted(operators_df['category'].dropna().unique())
+
+     for category in categories:
+         category_data = operators_df[operators_df['category'] == category].copy()
+
+         # Create JSON file for this category
+         filename = f"{category.replace(' ', '_').lower()}_operators.json"
+         filepath = os.path.join(knowledge_dir, filename)
+
+         print(f"Processing category: {category}")
+
+         # Convert to list of dicts
+         category_list = []
+         for idx, row in category_data.iterrows():
+             operator_dict = {}
+             for col in row.index:
+                 value = row[col]
+                 operator_dict[col] = to_jsonable(value)
+             category_list.append(operator_dict)
+
+         # Save category JSON
+         with open(filepath, 'w', encoding='utf-8') as f:
+             json.dump(category_list, f, ensure_ascii=False, indent=2)
+
+         print(f"✓ Created {filename} with {len(category_list)} operators")
+
+
+ def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
+     """Fetch datasets and save one JSON per region."""
+     print("=== Processing Datasets ===")
+
+     print("Fetching valid instrument/region/delay/universe combinations...")
+     options_df = fetch_all_combinations(session)
+     if options_df is None or options_df.empty:
+         print("No simulation options fetched; aborting dataset fetch.")
+         return
+
+     all_datasets: list[pd.DataFrame] = []
+     combo_idx = 0
+
+     for _, row in options_df.iterrows():
+         instrument_type = row.get("InstrumentType")
+         region = row.get("Region")
+         delay = row.get("Delay")
+         universes = row.get("Universe") or []
+
+         for universe in universes:
+             combo_idx += 1
+             print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
+             try:
+                 df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
+                 print(f" -> {len(df)} rows")
+                 all_datasets.append(df)
+             except Exception as exc:
+                 print(f" -> Failed: {exc}")
+
+     if not all_datasets:
+         print("No datasets fetched; nothing to save.")
+         return
+
+     combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
+     if combined_df.empty:
+         print("No datasets fetched; nothing to save.")
+         return
+
+     regions = sorted(combined_df["param_region"].dropna().unique())
+     print(f"Found regions: {', '.join(regions)}")
+
+     for region in regions:
+         region_df = combined_df[combined_df["param_region"] == region]
+         region_unique = merge_and_deduplicate([region_df])
+
+         region_list = []
+         for _, row in region_unique.iterrows():
+             record = {col: to_jsonable(row[col]) for col in row.index}
+             region_list.append(record)
+
+         filename = f"{region.replace(' ', '_').lower()}_datasets.json"
+         filepath = os.path.join(dataset_dir, filename)
+         with open(filepath, "w", encoding="utf-8") as f:
+             json.dump(region_list, f, ensure_ascii=False, indent=2)
+
+         print(f"✓ Created {filename} with {len(region_list)} datasets")
+
+
+ def process_documentation(session: ace_lib.SingleSession, knowledge_dir: str):
+     """Fetch tutorials and pages, save one JSON per page."""
+     print("=== Processing Documentation ===")
+
+     tutorials = fetch_tutorials(session)
+     if not tutorials:
+         print("No tutorials fetched; skipping documentation.")
+         return
+
+     print(f"Fetched {len(tutorials)} tutorials")
+
+     page_count = 0
+     seen_pages = set()
+
+     for idx, tutorial in enumerate(tutorials, start=1):
+         tutorial_id = _extract_page_id(tutorial) or f"tutorial_{idx}"
+         tutorial_title = tutorial.get("title") or tutorial_id
+
+         page_candidates = []
+         if isinstance(tutorial.get("pages"), list):
+             page_candidates.extend(tutorial["pages"])
+         if tutorial_id:
+             try:
+                 page_candidates.extend(fetch_tutorial_pages(session, tutorial_id))
+             except Exception as exc:
+                 print(f"[{idx:03d}] failed to fetch pages for {tutorial_id}: {exc}")
+
+         if not page_candidates and tutorial_id:
+             page_candidates.append({"id": tutorial_id, "title": tutorial_title})
+
+         for page_entry in page_candidates:
+             page_id = _extract_page_id(page_entry)
+             if not page_id or page_id in seen_pages:
+                 continue
+             seen_pages.add(page_id)
+
+             try:
+                 page = fetch_page(session, page_id)
+             except Exception as exc:
+                 print(f"[{idx:03d}] page {page_id} failed: {exc}")
+                 continue
+
+             page_count += 1
+             page_title = page.get("title") or page_entry.get("title") or page_id
+
+             # Save each page as individual JSON
+             filename = safe_filename(f"{idx:03d}_{page_title}", "_documentation.json")
+             filepath = os.path.join(knowledge_dir, filename)
+
+             with open(filepath, "w", encoding="utf-8") as f:
+                 json.dump(to_jsonable(page), f, ensure_ascii=False, indent=2)
+
+             print(f"[{idx:03d}] ✓ Created {filename}")
+
+     print(f"✓ Total: {page_count} documentation pages saved")
+
+
+ def main():
+     print("=== BRAIN Knowledge Base Processor ===")
+     print("Starting operator processing...\n")
+
+     # Get credentials
+     email, password = prompt_credentials()
+     ace_lib.get_credentials = lambda: (email, password)
+
+     print("Logging in to BRAIN platform...")
+     try:
+         session = ace_lib.start_session()
+         print("✓ Login successful\n")
+     except Exception as exc:
+         print(f"✗ Login failed: {exc}")
+         return
+
+     # Ensure knowledge directory exists
+     knowledge_dir = ensure_knowledge_dir()
+     dataset_dir = knowledge_dir  # Save datasets directly under knowledge
+     print(f"Knowledge directory: {knowledge_dir}\n")
+
+     # Process documentation (tutorials/pages)
+     print("\nStarting documentation processing...\n")
+     try:
+         process_documentation(session, knowledge_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process documentation: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+     # Process operators
+     try:
+         process_operators(session, knowledge_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process operators: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+     # Process datasets by region
+     print("\nStarting dataset processing...\n")
+     try:
+         process_datasets(session, dataset_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process datasets: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+     print("\n=== Processing Complete ===")
+
+
+ if __name__ == "__main__":
+     main()
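
For reference, here is a small standalone sketch of the file-naming scheme the script above uses when it writes JSON files into knowledge/ (the category and page title below are made-up examples; the real values come from the fetched BRAIN data):

import re

def safe_filename(name, suffix=""):
    # Same sanitization as process_knowledge_base.py: runs of characters outside
    # [A-Za-z0-9._-] collapse to "_", then the name is trimmed to 80 characters.
    base = re.sub(r"[^A-Za-z0-9._-]+", "_", str(name)).strip("_") or "doc"
    return f"{base[:80]}{suffix}"

# Documentation pages become "<index>_<title>_documentation.json"
print(safe_filename("001_Alpha 101: Getting Started", "_documentation.json"))
# -> 001_Alpha_101_Getting_Started_documentation.json

# Operator categories become "<category>_operators.json" (lowercased, spaces -> "_")
print("Special Operators".replace(" ", "_").lower() + "_operators.json")
# -> special_operators_operators.json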
@@ -0,0 +1,356 @@
+ import os
+ import json
+ import shutil
+ import chromadb
+ from fastembed import TextEmbedding
+ from watchdog.observers import Observer
+ from watchdog.events import FileSystemEventHandler
+ import threading
+
+
+ PREFERRED_MODELS = [
+     "jinaai/jina-embeddings-v2-base-zh",  # handles mixed Chinese/English well, ~0.64GB
+     "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",  # multilingual, ~50 languages
+     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # lightweight multilingual variant
+     "intfloat/multilingual-e5-large",  # stronger multilingual model, about 2.2GB
+ ]
+
+ # Final chosen model will be detected at runtime from supported list
+ MODEL_NAME = None
+ COLLECTION_NAME = "brain_kb_v5"
+
+ # Optional imports for different file types
+ try:
+     from pypdf import PdfReader
+ except ImportError:
+     PdfReader = None
+
+ try:
+     from docx import Document
+ except ImportError:
+     Document = None
+
+ class KnowledgeBase:
+     def __init__(self, kb_path="knowledge", db_path="vector_db"):
+         self.kb_path = os.path.abspath(kb_path)
+         self.db_path = os.path.abspath(db_path)
+         self.meta_path = os.path.join(self.db_path, "_meta.json")
+         self.manifest_path = os.path.join(self.db_path, "_manifest.json")
+         self._collection_reset_guard = False
+
+         if not os.path.exists(self.kb_path):
+             os.makedirs(self.kb_path)
+
+         # Initialize Embedding Model (BAAI/bge-small-zh-v1.5 is ~100MB)
+         # This will load from cache if already downloaded
+         # Pick the first available model from the preferred list
+         _supported_raw = TextEmbedding.list_supported_models()
+         supported = set()
+         for item in _supported_raw:
+             if isinstance(item, dict) and "model" in item:
+                 supported.add(item["model"])
+             elif isinstance(item, str):
+                 supported.add(item)
+         chosen = None
+         for name in PREFERRED_MODELS:
+             if name in supported:
+                 chosen = name
+                 break
+         if not chosen:
+             raise RuntimeError(
+                 "No preferred embedding models are supported by fastembed. "
+                 "Please check available models via TextEmbedding.list_supported_models()."
+             )
+
+         print(f"Loading Knowledge Base Embedding Model: {chosen} (may take some time on first run)...")
+         try:
+             self.model = TextEmbedding(model_name=chosen)
+             print("Embedding Model loaded successfully.")
+         except Exception as e:
+             print(f"Error loading embedding model: {e}")
+             raise
+
+         # Store chosen model name for reference
+         global MODEL_NAME
+         MODEL_NAME = chosen
+
+         # Cache embedding dimension (detects library/model changes that corrupt existing indexes)
+         self.embed_dim = self._get_embedding_dim()
+         self.chroma_version = getattr(chromadb, "__version__", "unknown")
+
+         # If the stored index was built with a different model/dimension/chromadb version, wipe it
+         self._maybe_reset_for_incompatibility(chosen, self.embed_dim, self.chroma_version)
+
+         # Initialize Vector DB
+         self._init_collection()
+
+         # Initial sync
+         self.sync_knowledge()
+
+         # Start Watcher
+         self.start_watcher()
+
+     def _init_collection(self, recreate: bool = False):
+         """(Re)initialize Chroma client/collection. If recreate=True, wipe on-disk index."""
+         if recreate and os.path.exists(self.db_path):
+             shutil.rmtree(self.db_path, ignore_errors=True)
+         try:
+             self.client = chromadb.PersistentClient(path=self.db_path)
+             self.collection = self.client.get_or_create_collection(
+                 name=COLLECTION_NAME,
+                 metadata={"hnsw:space": "cosine"}
+             )
+         except Exception as exc:
+             # If collection load itself fails, wipe and retry once to clear corrupted segments
+             if not recreate:
+                 shutil.rmtree(self.db_path, ignore_errors=True)
+                 return self._init_collection(recreate=True)
+             raise
+
+         # Persist metadata about the embedding model used to build this index
+         try:
+             os.makedirs(self.db_path, exist_ok=True)
+             with open(self.meta_path, "w", encoding="utf-8") as f:
+                 json.dump({
+                     "model": MODEL_NAME,
+                     "embed_dim": self.embed_dim,
+                     "chroma_version": self.chroma_version,
+                 }, f)
+         except Exception:
+             pass  # Metadata failure should not block runtime
+
+     def _maybe_reset_for_incompatibility(self, chosen_model: str, embed_dim: int, chroma_version: str):
+         """If existing index meta differs (model/dimension/chromadb), wipe it."""
+         if not os.path.exists(self.db_path):
+             return
+         try:
+             with open(self.meta_path, "r", encoding="utf-8") as f:
+                 meta = json.load(f)
+             prev_model = meta.get("model")
+             prev_dim = meta.get("embed_dim")
+             prev_chroma = meta.get("chroma_version")
+             if prev_model != chosen_model or prev_dim != embed_dim or prev_chroma != chroma_version:
+                 shutil.rmtree(self.db_path, ignore_errors=True)
+         except Exception:
+             # If meta cannot be read, assume stale/corrupted and rebuild
+             shutil.rmtree(self.db_path, ignore_errors=True)
+
+     def _get_embedding_dim(self) -> int:
+         for vec in self.model.embed(["dimension_probe"]):
+             try:
+                 return len(vec)
+             except Exception:
+                 return len(list(vec))
+         raise RuntimeError("Failed to determine embedding dimension")
+
+     def sync_knowledge(self, allow_reset: bool = True):
+         """Scans the knowledge folder and updates the vector database."""
+         print("Syncing knowledge base...")
+         manifest = self._load_manifest()
+         updated_manifest = {}
+         supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+         current_files = []
+         for filename in os.listdir(self.kb_path):
+             file_path = os.path.join(self.kb_path, filename)
+             if os.path.isfile(file_path) and filename.lower().endswith(supported_extensions):
+                 current_files.append(filename)
+                 mtime = os.path.getmtime(file_path)
+                 size = os.path.getsize(file_path)
+                 prev_meta = manifest.get(filename)
+                 # Skip unchanged files
+                 if prev_meta and prev_meta.get("mtime") == mtime and prev_meta.get("size") == size:
+                     updated_manifest[filename] = prev_meta
+                     continue
+                 try:
+                     content = self._extract_text(file_path)
+                     if content:
+                         # Sliding window chunking on original text
+                         chunk_size = 400
+                         overlap = 80
+                         original_chunks = []
+                         for i in range(0, len(content), chunk_size - overlap):
+                             chunk = content[i:i + chunk_size].strip()
+                             if chunk:
+                                 original_chunks.append(chunk)
+
+                         if original_chunks:
+                             # Normalize for embedding generation only (not for storage)
+                             normalized_chunks = [c.lower().replace('_', ' ') for c in original_chunks]
+
+                             ids = [f"{filename}_{i}" for i in range(len(original_chunks))]
+                             metadatas = [{"source": filename, "chunk": i} for i in range(len(original_chunks))]
+
+                             # Compute embeddings from normalized text
+                             embeddings = []
+                             for v in self.model.embed(normalized_chunks):
+                                 try:
+                                     embeddings.append(v.tolist())
+                                 except Exception:
+                                     embeddings.append(list(v))
+
+                             # Store ORIGINAL text (not normalized) so users see the real content
+                             self.collection.upsert(
+                                 documents=original_chunks,
+                                 ids=ids,
+                                 metadatas=metadatas,
+                                 embeddings=embeddings
+                             )
+                             print(f" ✓ Indexed {filename}: {len(original_chunks)} chunks")
+                         updated_manifest[filename] = {"mtime": mtime, "size": size}
+                 except Exception as e:
+                     err_msg = str(e)
+                     print(f"Error processing {filename}: {err_msg}")
+                     # Auto-recover if HNSW/compaction/index errors occur
+                     if allow_reset and any(x in err_msg.lower() for x in ["hnsw", "compaction", "segment reader"]):
+                         if not self._collection_reset_guard:
+                             print("Detected index corruption. Rebuilding vector_db and retrying sync once...")
+                             self._collection_reset_guard = True
+                             self._init_collection(recreate=True)
+                             return self.sync_knowledge(allow_reset=False)
+         # Remove deleted files from the index
+         deleted_files = set(manifest.keys()) - set(current_files)
+         for filename in deleted_files:
+             try:
+                 self.collection.delete(where={"source": filename})
+                 print(f" ✓ Removed deleted file from index: {filename}")
+             except Exception as e:
+                 print(f" ! Failed to remove {filename}: {e}")
+         # Persist manifest
+         self._save_manifest(updated_manifest)
+         print("Knowledge base sync complete.")
+
+     def _extract_text(self, file_path):
+         ext = os.path.splitext(file_path)[1].lower()
+         if ext == ".txt":
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 return f.read()
+         elif ext == ".md":
+             # Treat Markdown as plain text for retrieval
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 return f.read()
+         elif ext == ".pdf":
+             if PdfReader:
+                 reader = PdfReader(file_path)
+                 text = ""
+                 for page in reader.pages:
+                     text += page.extract_text() + "\n"
+                 return text
+             else:
+                 print("pypdf not installed, skipping PDF.")
+         elif ext == ".docx":
+             if Document:
+                 doc = Document(file_path)
+                 return "\n".join([para.text for para in doc.paragraphs])
+             else:
+                 print("python-docx not installed, skipping Word.")
+         elif ext == ".json":
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+                 return json.dumps(data, ensure_ascii=False, indent=2)
+         return None
+
+     def query(self, text, top_k=5, distance_threshold=0.8):
+         """Retrieves relevant snippets from the knowledge base.
+
+         Uses cosine distance (lower is better). A result is treated as a hit only
+         when best_distance <= distance_threshold.
+         Returns:
+             dict: {"hit": bool, "context": str, "hits": [{source, chunk, distance, text}, ...]}
+         """
+         try:
+             # Normalize query same as indexed content
+             normalized_text = text.lower().replace('_', ' ')
+
+             q_vec = None
+             for v in self.model.embed([normalized_text]):
+                 try:
+                     q_vec = v.tolist()
+                 except Exception:
+                     q_vec = list(v)
+                 break
+             if q_vec is None:
+                 return {"hit": False, "context": "", "hits": []}
+
+             results = self.collection.query(
+                 query_embeddings=[q_vec],
+                 n_results=top_k,
+                 include=["documents", "metadatas", "distances"]
+             )
+
+             docs = (results or {}).get("documents") or []
+             metas = (results or {}).get("metadatas") or []
+             dists = (results or {}).get("distances") or []
+
+             if not docs or not docs[0]:
+                 print("[KB Query] No results returned from collection")
+                 return {"hit": False, "context": "", "hits": []}
+
+             docs0 = docs[0]
+             metas0 = metas[0] if metas and metas[0] else [{} for _ in docs0]
+             dists0 = dists[0] if dists and dists[0] else [None for _ in docs0]
+
+             hits = []
+             for doc_text, meta, dist in zip(docs0, metas0, dists0):
+                 hits.append({
+                     "source": (meta or {}).get("source", ""),
+                     "chunk": (meta or {}).get("chunk", None),
+                     "distance": dist,
+                     "text": doc_text,
+                 })
+
+             best = hits[0].get("distance")
+             is_hit = (best is not None) and (best <= distance_threshold)
+
+             # Debug log
+             best_str = f"{best:.4f}" if best is not None else "N/A"
+             print(f"[KB Query] '{text[:50]}...' -> best_dist={best_str}, threshold={distance_threshold}, hit={is_hit}")
+             if hits:
+                 top3_dists = [f"{h['distance']:.4f}" if h['distance'] is not None else "N/A" for h in hits[:3]]
+                 print(f"[KB Query] Top 3 distances: {top3_dists}")
+
+             context = "\n---\n".join([h["text"] for h in hits]) if is_hit else ""
+             return {"hit": is_hit, "context": context, "hits": hits}
+         except Exception as e:
+             print(f"Query error: {e}")
+             import traceback
+             traceback.print_exc()
+             return {"hit": False, "context": "", "hits": []}
+
+     def start_watcher(self):
+         event_handler = KBHandler(self)
+         self.observer = Observer()
+         self.observer.schedule(event_handler, self.kb_path, recursive=False)
+         self.observer.start()
+
+     def _load_manifest(self):
+         if not os.path.exists(self.manifest_path):
+             return {}
+         try:
+             with open(self.manifest_path, "r", encoding="utf-8") as f:
+                 return json.load(f)
+         except Exception:
+             return {}
+
+     def _save_manifest(self, data):
+         try:
+             os.makedirs(self.db_path, exist_ok=True)
+             with open(self.manifest_path, "w", encoding="utf-8") as f:
+                 json.dump(data, f, ensure_ascii=False, indent=2)
+         except Exception as e:
+             print(f" ! Failed to save manifest: {e}")
+
+ class KBHandler(FileSystemEventHandler):
+     def __init__(self, kb_instance):
+         self.kb = kb_instance
+         self.supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+
+     def on_modified(self, event):
+         if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+             print(f"File modified: {event.src_path}. Re-syncing...")
+             threading.Thread(target=self.kb.sync_knowledge).start()
+
+     def on_created(self, event):
+         if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+             print(f"File created: {event.src_path}. Syncing...")
+             threading.Thread(target=self.kb.sync_knowledge).start()
+
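
For orientation, a minimal usage sketch of the KnowledgeBase class added above (the query string and paths are illustrative; it assumes the module is importable as rag_engine and that a knowledge/ folder with supported files sits next to it):

from rag_engine import KnowledgeBase

# Constructing the class picks an embedding model, builds or loads the Chroma
# index under vector_db/, syncs the knowledge/ folder once, and starts the
# watchdog file watcher.
kb = KnowledgeBase(kb_path="knowledge", db_path="vector_db")

# query() returns {"hit": bool, "context": str, "hits": [...]}. A result only
# counts as a hit when the best cosine distance is <= distance_threshold.
result = kb.query("how to use the rank operator", top_k=5, distance_threshold=0.8)
if result["hit"]:
    print(result["context"])
else:
    print("No close match; best distances:", [h["distance"] for h in result["hits"][:3]])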
@@ -0,0 +1,7 @@
+ openai
+ Pillow
+ fastembed>=0.3.4
+ chromadb>=0.5.0
+ watchdog
+ pypdf
+ python-docx