cnhkmcp 2.0.3-py3-none-any.whl → 2.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cnhkmcp/untracked/AI桌面插件/README.md +38 -0
  2. cnhkmcp/untracked/AI桌面插件/config.json +6 -0
  3. cnhkmcp/untracked/AI桌面插件/get_knowledgeBase_tool/ace_lib.py +1510 -0
  4. cnhkmcp/untracked/AI桌面插件/get_knowledgeBase_tool/fetch_all_datasets.py +157 -0
  5. cnhkmcp/untracked/AI桌面插件/get_knowledgeBase_tool/fetch_all_documentation.py +132 -0
  6. cnhkmcp/untracked/AI桌面插件/get_knowledgeBase_tool/fetch_all_operators.py +99 -0
  7. cnhkmcp/untracked/AI桌面插件/get_knowledgeBase_tool/helpful_functions.py +180 -0
  8. cnhkmcp/untracked/AI桌面插件/icon.ico +0 -0
  9. cnhkmcp/untracked/AI桌面插件/icon.png +0 -0
  10. cnhkmcp/untracked/AI桌面插件/knowledge/test.txt +1 -0
  11. cnhkmcp/untracked/AI桌面插件/main.py +581 -0
  12. cnhkmcp/untracked/AI桌面插件/process_knowledge_base.py +280 -0
  13. cnhkmcp/untracked/AI桌面插件/rag_engine.py +265 -0
  14. cnhkmcp/untracked/AI桌面插件/requirements.txt +12 -0
  15. cnhkmcp/untracked/AI桌面插件/run.bat +3 -0
  16. cnhkmcp/untracked/AI桌面插件/vector_db/chroma.sqlite3 +0 -0
  17. cnhkmcp/untracked/AI桌面插件/首次运行打开我.py +265 -0
  18. cnhkmcp/untracked/APP/Tranformer/Transformer.py +2804 -11
  19. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +1524 -889
  20. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +884 -111
  21. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +442 -168
  22. cnhkmcp/untracked/APP/Tranformer/template_summary.txt +2775 -1
  23. cnhkmcp/untracked/APP/ace.log +2 -0
  24. cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +157 -0
  25. cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +99 -0
  26. cnhkmcp/untracked/APP/simulator/simulator_wqb.py +16 -16
  27. cnhkmcp/untracked/APP/static/brain.js +61 -0
  28. cnhkmcp/untracked/APP/static/script.js +140 -0
  29. cnhkmcp/untracked/APP/templates/index.html +25 -4
  30. cnhkmcp/untracked/APP/运行打开我.py +70 -8
  31. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/METADATA +1 -1
  32. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/RECORD +36 -20
  33. cnhkmcp/untracked/APP/hkSimulator/ace.log +0 -0
  34. cnhkmcp/untracked/APP/hkSimulator/autosim_20251205_145240.log +0 -0
  35. cnhkmcp/untracked/APP/hkSimulator/autosim_20251215_030103.log +0 -0
  36. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/WHEEL +0 -0
  37. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/entry_points.txt +0 -0
  38. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/licenses/LICENSE +0 -0
  39. {cnhkmcp-2.0.3.dist-info → cnhkmcp-2.1.0.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/AI桌面插件/process_knowledge_base.py
@@ -0,0 +1,280 @@
+ import os
+ import sys
+ import pandas as pd
+ import json
+ import re
+ from pathlib import Path
+ from typing import List
+
+ # Add get_knowledgeBase_tool to path
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+ TOOL_DIR = os.path.join(SCRIPT_DIR, "get_knowledgeBase_tool")
+ if TOOL_DIR not in sys.path:
+     sys.path.insert(0, TOOL_DIR)
+
+ # Import from tool directory
+ sys.path.insert(0, TOOL_DIR)
+ import ace_lib
+ from fetch_all_operators import fetch_operators, prompt_credentials
+ from fetch_all_datasets import (
+     fetch_all_combinations,
+     fetch_datasets_for_combo,
+     merge_and_deduplicate,
+ )
+ from fetch_all_documentation import (
+     fetch_tutorials,
+     fetch_tutorial_pages,
+     fetch_page,
+     _extract_page_id,
+ )
+
+
+ def ensure_knowledge_dir():
+     """Ensure knowledge directory exists"""
+     knowledge_dir = os.path.join(SCRIPT_DIR, "knowledge")
+     os.makedirs(knowledge_dir, exist_ok=True)
+     return knowledge_dir
+
+
+ def to_jsonable(value):
+     """Convert values to JSON-serializable, handling NaN and nested structures."""
+     try:
+         if isinstance(value, float) and pd.isna(value):
+             return None
+     except TypeError:
+         pass
+
+     if isinstance(value, list):
+         return [to_jsonable(v) for v in value if not (isinstance(v, float) and pd.isna(v))]
+     if isinstance(value, dict):
+         return {k: to_jsonable(v) for k, v in value.items()}
+     if isinstance(value, (str, int, float, bool)) or value is None:
+         return value
+     return str(value)
+
+
+ def safe_filename(name: str, suffix: str = "") -> str:
+     base = re.sub(r"[^A-Za-z0-9._-]+", "_", str(name)).strip("_") or "doc"
+     base = base[:80]
+     return f"{base}{suffix}"
+
+
+ def process_operators(session: ace_lib.SingleSession, knowledge_dir: str):
+     """
+     Process operators and save as JSON files
+
+     Args:
+         session: Authenticated BRAIN session
+         knowledge_dir: Directory to save JSON files
+     """
+     print("\n=== Processing Operators ===")
+
+     # Fetch operators data
+     print("Fetching operators...")
+     operators_df = fetch_operators(session)
+
+     if operators_df.empty:
+         print("No operators found!")
+         return
+
+     print(f"Found {len(operators_df)} operator entries")
+
+     # Get unique categories
+     categories = sorted(operators_df['category'].dropna().unique())
+
+     for category in categories:
+         category_data = operators_df[operators_df['category'] == category].copy()
+
+         # Create JSON file for this category
+         filename = f"{category.replace(' ', '_').lower()}_operators.json"
+         filepath = os.path.join(knowledge_dir, filename)
+
+         print(f"Processing category: {category}")
+
+         # Convert to list of dicts
+         category_list = []
+         for idx, row in category_data.iterrows():
+             operator_dict = {}
+             for col in row.index:
+                 value = row[col]
+                 operator_dict[col] = to_jsonable(value)
+             category_list.append(operator_dict)
+
+         # Save category JSON
+         with open(filepath, 'w', encoding='utf-8') as f:
+             json.dump(category_list, f, ensure_ascii=False, indent=2)
+
+         print(f"✓ Created {filename} with {len(category_list)} operators")
+
+
+ def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
+     """Fetch datasets and save one JSON per region."""
+     print("=== Processing Datasets ===")
+
+     print("Fetching valid instrument/region/delay/universe combinations...")
+     options_df = fetch_all_combinations(session)
+     if options_df is None or options_df.empty:
+         print("No simulation options fetched; aborting dataset fetch.")
+         return
+
+     all_datasets: list[pd.DataFrame] = []
+     combo_idx = 0
+
+     for _, row in options_df.iterrows():
+         instrument_type = row.get("InstrumentType")
+         region = row.get("Region")
+         delay = row.get("Delay")
+         universes = row.get("Universe") or []
+
+         for universe in universes:
+             combo_idx += 1
+             print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
+             try:
+                 df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
+                 print(f" -> {len(df)} rows")
+                 all_datasets.append(df)
+             except Exception as exc:
+                 print(f" -> Failed: {exc}")
+
+     if not all_datasets:
+         print("No datasets fetched; nothing to save.")
+         return
+
+     combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
+     if combined_df.empty:
+         print("No datasets fetched; nothing to save.")
+         return
+
+     regions = sorted(combined_df["param_region"].dropna().unique())
+     print(f"Found regions: {', '.join(regions)}")
+
+     for region in regions:
+         region_df = combined_df[combined_df["param_region"] == region]
+         region_unique = merge_and_deduplicate([region_df])
+
+         region_list = []
+         for _, row in region_unique.iterrows():
+             record = {col: to_jsonable(row[col]) for col in row.index}
+             region_list.append(record)
+
+         filename = f"{region.replace(' ', '_').lower()}_datasets.json"
+         filepath = os.path.join(dataset_dir, filename)
+         with open(filepath, "w", encoding="utf-8") as f:
+             json.dump(region_list, f, ensure_ascii=False, indent=2)
+
+         print(f"✓ Created {filename} with {len(region_list)} datasets")
+
+
+ def process_documentation(session: ace_lib.SingleSession, knowledge_dir: str):
+     """Fetch tutorials and pages, save one JSON per page."""
+     print("=== Processing Documentation ===")
+
+     tutorials = fetch_tutorials(session)
+     if not tutorials:
+         print("No tutorials fetched; skipping documentation.")
+         return
+
+     print(f"Fetched {len(tutorials)} tutorials")
+
+     page_count = 0
+     seen_pages = set()
+
+     for idx, tutorial in enumerate(tutorials, start=1):
+         tutorial_id = _extract_page_id(tutorial) or f"tutorial_{idx}"
+         tutorial_title = tutorial.get("title") or tutorial_id
+
+         page_candidates = []
+         if isinstance(tutorial.get("pages"), list):
+             page_candidates.extend(tutorial["pages"])
+         if tutorial_id:
+             try:
+                 page_candidates.extend(fetch_tutorial_pages(session, tutorial_id))
+             except Exception as exc:
+                 print(f"[{idx:03d}] failed to fetch pages for {tutorial_id}: {exc}")
+
+         if not page_candidates and tutorial_id:
+             page_candidates.append({"id": tutorial_id, "title": tutorial_title})
+
+         for page_entry in page_candidates:
+             page_id = _extract_page_id(page_entry)
+             if not page_id or page_id in seen_pages:
+                 continue
+             seen_pages.add(page_id)
+
+             try:
+                 page = fetch_page(session, page_id)
+             except Exception as exc:
+                 print(f"[{idx:03d}] page {page_id} failed: {exc}")
+                 continue
+
+             page_count += 1
+             page_title = page.get("title") or page_entry.get("title") or page_id
+
+             # Save each page as individual JSON
+             filename = safe_filename(f"{idx:03d}_{page_title}", "_documentation.json")
+             filepath = os.path.join(knowledge_dir, filename)
+
+             with open(filepath, "w", encoding="utf-8") as f:
+                 json.dump(to_jsonable(page), f, ensure_ascii=False, indent=2)
+
+             print(f"[{idx:03d}] ✓ Created {filename}")
+
+     print(f"✓ Total: {page_count} documentation pages saved")
+
+
+ def main():
+     print("=== BRAIN Knowledge Base Processor ===")
+     print("Starting operator processing...\n")
+
+     # Get credentials
+     email, password = prompt_credentials()
+     ace_lib.get_credentials = lambda: (email, password)
+
+     print("Logging in to BRAIN platform...")
+     try:
+         session = ace_lib.start_session()
+         print("✓ Login successful\n")
+     except Exception as exc:
+         print(f"✗ Login failed: {exc}")
+         return
+
+     # Ensure knowledge directory exists
+     knowledge_dir = ensure_knowledge_dir()
+     dataset_dir = knowledge_dir  # Save datasets directly under knowledge
+     print(f"Knowledge directory: {knowledge_dir}\n")
+
+     # Process documentation (tutorials/pages)
+     print("\nStarting documentation processing...\n")
+     try:
+         process_documentation(session, knowledge_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process documentation: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+     # Process operators
+     try:
+         process_operators(session, knowledge_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process operators: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+     # Process datasets by region
+     print("\nStarting dataset processing...\n")
+     try:
+         process_datasets(session, dataset_dir)
+     except Exception as exc:
+         print(f"✗ Failed to process datasets: {exc}")
+         import traceback
+         traceback.print_exc()
+         return
+
+
+     print("\n=== Processing Complete ===")
+
+
+ if __name__ == "__main__":
+     main()
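
For reference, the two small helpers in the hunk above are easy to illustrate. A quick sketch with made-up example values, assumed to run in the context of this module (pandas available, as in its imports):

    # Illustrative behaviour of the helpers defined above (values are examples only).
    print(to_jsonable(float("nan")))               # None: NaN floats collapse to None
    print(to_jsonable([1.0, float("nan"), "x"]))   # [1.0, 'x']: NaN entries are dropped from lists
    print(to_jsonable({"tag": ("EQUITY", 1)}))     # {'tag': "('EQUITY', 1)"}: unsupported types fall back to str()
    print(safe_filename("001_Alpha 101: Intro?", "_documentation.json"))
    # '001_Alpha_101_Intro_documentation.json': disallowed character runs become "_", base capped at 80 chars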
cnhkmcp/untracked/AI桌面插件/rag_engine.py
@@ -0,0 +1,265 @@
+ import os
+ import json
+ import shutil
+ import chromadb
+ from fastembed import TextEmbedding
+ from watchdog.observers import Observer
+ from watchdog.events import FileSystemEventHandler
+ import threading
+
+
+ PREFERRED_MODELS = [
+     "jinaai/jina-embeddings-v2-base-zh",  # good for mixed Chinese/English, ~0.64GB
+     "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",  # multilingual, ~50 languages
+     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # lightweight multilingual variant
+     "intfloat/multilingual-e5-large",  # stronger multilingual model, ~2.2GB
+ ]
+
+ # Final chosen model will be detected at runtime from supported list
+ MODEL_NAME = None
+ COLLECTION_NAME = "brain_kb_v5"
+
+ # Optional imports for different file types
+ try:
+     from pypdf import PdfReader
+ except ImportError:
+     PdfReader = None
+
+ try:
+     from docx import Document
+ except ImportError:
+     Document = None
+
+ class KnowledgeBase:
+     def __init__(self, kb_path="knowledge", db_path="vector_db"):
+         self.kb_path = os.path.abspath(kb_path)
+         self.db_path = os.path.abspath(db_path)
+         self._collection_reset_guard = False
+
+         if not os.path.exists(self.kb_path):
+             os.makedirs(self.kb_path)
+
+         # Initialize Embedding Model (BAAI/bge-small-zh-v1.5 is ~100MB)
+         # This will load from cache if already downloaded
+         # Pick the first available model from the preferred list
+         _supported_raw = TextEmbedding.list_supported_models()
+         supported = set()
+         for item in _supported_raw:
+             if isinstance(item, dict) and "model" in item:
+                 supported.add(item["model"])
+             elif isinstance(item, str):
+                 supported.add(item)
+         chosen = None
+         for name in PREFERRED_MODELS:
+             if name in supported:
+                 chosen = name
+                 break
+         if not chosen:
+             raise RuntimeError(
+                 "No preferred embedding models are supported by fastembed. "
+                 "Please check available models via TextEmbedding.list_supported_models()."
+             )
+
+         print(f"Loading Knowledge Base Embedding Model: {chosen} (may take some time on first run)...")
+         try:
+             self.model = TextEmbedding(model_name=chosen)
+             print("Embedding Model loaded successfully.")
+         except Exception as e:
+             print(f"Error loading embedding model: {e}")
+             raise
+
+         # Store chosen model name for reference
+         global MODEL_NAME
+         MODEL_NAME = chosen
+
+         # Initialize Vector DB
+         self._init_collection()
+
+         # Initial sync
+         self.sync_knowledge()
+
+         # Start Watcher
+         self.start_watcher()
+
+     def _init_collection(self, recreate: bool = False):
+         """(Re)initialize Chroma client/collection. If recreate=True, wipe on-disk index."""
+         if recreate and os.path.exists(self.db_path):
+             shutil.rmtree(self.db_path, ignore_errors=True)
+         self.client = chromadb.PersistentClient(path=self.db_path)
+         self.collection = self.client.get_or_create_collection(
+             name=COLLECTION_NAME,
+             metadata={"hnsw:space": "cosine"}
+         )
+
+     def sync_knowledge(self, allow_reset: bool = True):
+         """Scans the knowledge folder and updates the vector database."""
+         print("Syncing knowledge base...")
+         supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+         for filename in os.listdir(self.kb_path):
+             file_path = os.path.join(self.kb_path, filename)
+             if os.path.isfile(file_path) and filename.lower().endswith(supported_extensions):
+                 try:
+                     content = self._extract_text(file_path)
+                     if content:
+                         # Sliding window chunking on original text
+                         chunk_size = 400
+                         overlap = 80
+                         original_chunks = []
+                         for i in range(0, len(content), chunk_size - overlap):
+                             chunk = content[i:i + chunk_size].strip()
+                             if chunk:
+                                 original_chunks.append(chunk)
+
+                         if original_chunks:
+                             # Normalize for embedding generation only (not for storage)
+                             normalized_chunks = [c.lower().replace('_', ' ') for c in original_chunks]
+
+                             ids = [f"{filename}_{i}" for i in range(len(original_chunks))]
+                             metadatas = [{"source": filename, "chunk": i} for i in range(len(original_chunks))]
+
+                             # Compute embeddings from normalized text
+                             embeddings = []
+                             for v in self.model.embed(normalized_chunks):
+                                 try:
+                                     embeddings.append(v.tolist())
+                                 except Exception:
+                                     embeddings.append(list(v))
+
+                             # Store ORIGINAL text (not normalized) so users see the real content
+                             self.collection.upsert(
+                                 documents=original_chunks,
+                                 ids=ids,
+                                 metadatas=metadatas,
+                                 embeddings=embeddings
+                             )
+                             print(f" ✓ Indexed {filename}: {len(original_chunks)} chunks")
+                 except Exception as e:
+                     err_msg = str(e)
+                     print(f"Error processing {filename}: {err_msg}")
+                     # Auto-recover if HNSW/compaction/index errors occur
+                     if allow_reset and any(x in err_msg.lower() for x in ["hnsw", "compaction", "segment reader"]):
+                         if not self._collection_reset_guard:
+                             print("Detected index corruption. Rebuilding vector_db and retrying sync once...")
+                             self._collection_reset_guard = True
+                             self._init_collection(recreate=True)
+                             return self.sync_knowledge(allow_reset=False)
+         print("Knowledge base sync complete.")
+
+     def _extract_text(self, file_path):
+         ext = os.path.splitext(file_path)[1].lower()
+         if ext == ".txt":
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 return f.read()
+         elif ext == ".md":
+             # Treat Markdown as plain text for retrieval
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 return f.read()
+         elif ext == ".pdf":
+             if PdfReader:
+                 reader = PdfReader(file_path)
+                 text = ""
+                 for page in reader.pages:
+                     text += page.extract_text() + "\n"
+                 return text
+             else:
+                 print("pypdf not installed, skipping PDF.")
+         elif ext == ".docx":
+             if Document:
+                 doc = Document(file_path)
+                 return "\n".join([para.text for para in doc.paragraphs])
+             else:
+                 print("python-docx not installed, skipping Word.")
+         elif ext == ".json":
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+                 return json.dumps(data, ensure_ascii=False, indent=2)
+         return None
+
+     def query(self, text, top_k=5, distance_threshold=0.8):
+         """Retrieves relevant snippets from the knowledge base.
+
+         Uses cosine distance (lower is better). A result is treated as a hit only
+         when best_distance <= distance_threshold.
+         Returns:
+             dict: {"hit": bool, "context": str, "hits": [{source, chunk, distance, text}, ...]}
+         """
+         try:
+             # Normalize query same as indexed content
+             normalized_text = text.lower().replace('_', ' ')
+
+             q_vec = None
+             for v in self.model.embed([normalized_text]):
+                 try:
+                     q_vec = v.tolist()
+                 except Exception:
+                     q_vec = list(v)
+                 break
+             if q_vec is None:
+                 return {"hit": False, "context": "", "hits": []}
+
+             results = self.collection.query(
+                 query_embeddings=[q_vec],
+                 n_results=top_k,
+                 include=["documents", "metadatas", "distances"]
+             )
+
+             docs = (results or {}).get("documents") or []
+             metas = (results or {}).get("metadatas") or []
+             dists = (results or {}).get("distances") or []
+
+             if not docs or not docs[0]:
+                 print("[KB Query] No results returned from collection")
+                 return {"hit": False, "context": "", "hits": []}
+
+             docs0 = docs[0]
+             metas0 = metas[0] if metas and metas[0] else [{} for _ in docs0]
+             dists0 = dists[0] if dists and dists[0] else [None for _ in docs0]
+
+             hits = []
+             for doc_text, meta, dist in zip(docs0, metas0, dists0):
+                 hits.append({
+                     "source": (meta or {}).get("source", ""),
+                     "chunk": (meta or {}).get("chunk", None),
+                     "distance": dist,
+                     "text": doc_text,
+                 })
+
+             best = hits[0].get("distance")
+             is_hit = (best is not None) and (best <= distance_threshold)
+
+             # Debug log
+             best_str = f"{best:.4f}" if best is not None else "N/A"
+             print(f"[KB Query] '{text[:50]}...' -> best_dist={best_str}, threshold={distance_threshold}, hit={is_hit}")
+             if hits:
+                 top3_dists = [f"{h['distance']:.4f}" if h['distance'] is not None else "N/A" for h in hits[:3]]
+                 print(f"[KB Query] Top 3 distances: {top3_dists}")
+
+             context = "\n---\n".join([h["text"] for h in hits]) if is_hit else ""
+             return {"hit": is_hit, "context": context, "hits": hits}
+         except Exception as e:
+             print(f"Query error: {e}")
+             import traceback
+             traceback.print_exc()
+             return {"hit": False, "context": "", "hits": []}
+
+     def start_watcher(self):
+         event_handler = KBHandler(self)
+         self.observer = Observer()
+         self.observer.schedule(event_handler, self.kb_path, recursive=False)
+         self.observer.start()
+
+ class KBHandler(FileSystemEventHandler):
+     def __init__(self, kb_instance):
+         self.kb = kb_instance
+         self.supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+
+     def on_modified(self, event):
+         if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+             print(f"File modified: {event.src_path}. Re-syncing...")
+             threading.Thread(target=self.kb.sync_knowledge).start()
+
+     def on_created(self, event):
+         if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+             print(f"File created: {event.src_path}. Syncing...")
+             threading.Thread(target=self.kb.sync_knowledge).start()
+
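
For orientation, a minimal usage sketch of the KnowledgeBase class added above, based only on the constructor defaults and the query() contract shown in this hunk; the rag_engine module name is taken from the file list, and the query string is illustrative:

    # Minimal sketch: module name assumed from the file list (rag_engine.py), query text is an example.
    from rag_engine import KnowledgeBase

    kb = KnowledgeBase(kb_path="knowledge", db_path="vector_db")  # loads the embedding model, indexes files, starts the watcher
    result = kb.query("how do I fetch operator metadata", top_k=5, distance_threshold=0.8)
    if result["hit"]:
        print(result["context"])                       # top snippets joined with "---"
    for hit in result["hits"]:
        print(hit["source"], hit["chunk"], hit["distance"])

Note that constructing the class is side-effecting: it downloads the first supported model from PREFERRED_MODELS, indexes every supported file under the knowledge directory, and starts a watchdog observer that re-syncs on file changes.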
cnhkmcp/untracked/AI桌面插件/requirements.txt
@@ -0,0 +1,12 @@
+ openai
+ pyautogui
+ Pillow
+ pyperclip
+ keyboard
+ fastembed>=0.3.4
+ chromadb>=0.5.0
+ watchdog
+ urllib3>=2.5.0
+ pypdf
+ python-docx
+ pywin32