cnhkmcp 2.1.0__tar.gz → 2.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cnhkmcp-2.1.0/cnhkmcp.egg-info → cnhkmcp-2.1.2}/PKG-INFO +1 -1
- {cnhkmcp-2.1.0/cnhkmcp/untracked/APP/give_me_idea → cnhkmcp-2.1.2/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool}/helpful_functions.py +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/main.py +0 -5
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/process_knowledge_base.py +73 -72
- cnhkmcp-2.1.2/cnhkmcp/untracked/AI/桌面插件/rag_engine.py +408 -0
- cnhkmcp-2.1.2/cnhkmcp/untracked/AI/桌面插件/requirements.txt +7 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/Transformer.py +1 -1
- cnhkmcp-2.1.2/cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/helpful_functions.py +1 -1
- {cnhkmcp-2.1.0/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool → cnhkmcp-2.1.2/cnhkmcp/untracked/APP/give_me_idea}/helpful_functions.py +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/helpful_functions.py +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/hkSimulator/ace_lib.py +2 -2
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/hkSimulator/autosimulator.py +4 -4
- cnhkmcp-2.1.2/cnhkmcp/untracked/APP/hkSimulator/helpful_functions.py +180 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/simulator/simulator_wqb.py +1 -1
- cnhkmcp-2.1.2/cnhkmcp/untracked/APP/缘分一道桥/helpful_functions.py +180 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/platform_functions.py +2 -2
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/platform_functions.py +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2/cnhkmcp.egg-info}/PKG-INFO +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/SOURCES.txt +1 -1
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/setup.py +1 -1
- cnhkmcp-2.1.0/cnhkmcp/untracked/AI/桌面插件/rag_engine.py +0 -265
- cnhkmcp-2.1.0/cnhkmcp/untracked/AI/桌面插件/requirements.txt +0 -12
- cnhkmcp-2.1.0/cnhkmcp/untracked/AI/桌面插件/vector_db/chroma.sqlite3 +0 -0
- cnhkmcp-2.1.0/cnhkmcp/untracked/APP/hkSimulator/helpful_functions.py +0 -180
- cnhkmcp-2.1.0/cnhkmcp/untracked/APP/缘分一道桥/helpful_functions.py +0 -180
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/LICENSE +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/MANIFEST.in +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/README.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/__init__.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/README.md +0 -0
- cnhkmcp-2.1.0/cnhkmcp/untracked/APP/Tranformer/ace.log → cnhkmcp-2.1.2/cnhkmcp/untracked/AI/桌面插件/ace.log +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/config.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/ace_lib.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_datasets.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_documentation.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/get_knowledgeBase_tool/fetch_all_operators.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/icon.ico +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/icon.png +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/knowledge/test.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/run.bat +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/AI/桌面插件/首次运行打开我.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/.gitignore +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/MODULAR_STRUCTURE.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/README.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/ace_lib.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates_示例.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_示例可直接载入Machine_lib.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/template_summary.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/transformer_config.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/Tranformer/validator.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/ace.log +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/ace_lib.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/blueprints/__init__.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/blueprints/feature_engineering.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/blueprints/idea_house.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/blueprints/inspiration_house.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/blueprints/paper_analysis.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/custom_templates/templates.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/ace_lib.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/mirror_config.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/operaters.csv +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/requirements.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/run_app.bat +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/run_app.sh +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/setup_tsinghua.bat +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/setup_tsinghua.sh +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/simulator/alpha_submitter.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/ssrn-3332513.pdf +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/brain.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/decoder.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/feature_engineering.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/idea_house.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/inspiration.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/inspiration_house.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/paper_analysis.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/script.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/simulator.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/styles.css +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/static/usage_widget.js +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/alpha_inspector.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/feature_engineering.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/idea_house.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/index.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/inspiration_house.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/paper_analysis.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/simulator.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/templates/transformer_web.html +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/usage.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/缘分一道桥/ace_lib.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/缘分一道桥/brain_alpha_inspector.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/APP/运行打开我.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/arXiv_API_Tool_Manual.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/arxiv_api.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/forum_functions.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/forum_functions.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/user_config.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/让AI读这个文档来学会下载浏览器.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/配置前运行我_安装必要依赖包.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/sample_mcp_config.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/user_config.json +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例参考文档_BRAIN_Alpha_Test_Requirements_and_Tips.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例工作流_Alpha_explaination_workflow.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例工作流_BRAIN_6_Tips_Datafield_Exploration_Guide.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例工作流_BRAIN_Alpha_Improvement_Workflow.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例工作流_Dataset_Exploration_Expert_Manual.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/示例工作流_daily_report_workflow.md +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp/untracked/配置前运行我_安装必要依赖包.py +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/dependency_links.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/entry_points.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/not-zip-safe +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/requires.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/cnhkmcp.egg-info/top_level.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/requirements.txt +0 -0
- {cnhkmcp-2.1.0 → cnhkmcp-2.1.2}/setup.cfg +0 -0
cnhkmcp/untracked/AI/桌面插件/main.py
@@ -4,7 +4,6 @@ import base64
 import tkinter as tk
 from tkinter import scrolledtext, messagebox, Toplevel
 from PIL import Image, ImageTk, ImageGrab
-import pyautogui
 from openai import OpenAI
 import threading
 import io
@@ -21,14 +20,10 @@ def install_dependencies():
     # Mapping of package names to their import names (if different)
     packages = {
         "openai": "openai",
-        "pyautogui": "pyautogui",
         "Pillow": "PIL",
-        "pyperclip": "pyperclip",
-        "keyboard": "keyboard",
         "fastembed": "fastembed",
         "chromadb": "chromadb",
         "watchdog": "watchdog",
-        "urllib3": "urllib3",
         "pypdf": "pypdf",
         "python-docx": "docx"
     }
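The `packages` dict maps each pip distribution name to the name it is imported under; the two differ for Pillow (`PIL`) and python-docx (`docx`). The body of `install_dependencies()` is not part of this diff, so the following is only a minimal sketch of the conventional pattern such a mapping drives (an assumption, not the package's actual implementation):

    import importlib
    import subprocess
    import sys

    def install_missing(packages):
        # packages: pip distribution name -> import name, e.g. {"Pillow": "PIL"}
        for pip_name, import_name in packages.items():
            try:
                importlib.import_module(import_name)  # probe by import name
            except ImportError:
                # install by distribution name when the import probe fails
                subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])

Net effect of the hunk: pyautogui, pyperclip, keyboard, and urllib3 are no longer imported or auto-installed by the desktop plugin.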
cnhkmcp/untracked/AI/桌面插件/process_knowledge_base.py
@@ -16,17 +16,18 @@ if TOOL_DIR not in sys.path:
     sys.path.insert(0, TOOL_DIR)
 import ace_lib
 from fetch_all_operators import fetch_operators, prompt_credentials
-from fetch_all_datasets import (
-    fetch_all_combinations,
-    fetch_datasets_for_combo,
-    merge_and_deduplicate,
-)
 from fetch_all_documentation import (
     fetch_tutorials,
     fetch_tutorial_pages,
     fetch_page,
     _extract_page_id,
 )
+# Dataset fetching currently disabled per request
+# from fetch_all_datasets import (
+#     fetch_all_combinations,
+#     fetch_datasets_for_combo,
+#     merge_and_deduplicate,
+# )
 
 
 def ensure_knowledge_dir():
@@ -107,62 +108,63 @@ def process_operators(session: ace_lib.SingleSession, knowledge_dir: str):
         print(f"✓ Created {filename} with {len(category_list)} operators")
 
 
-def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
-    """Fetch datasets and save one JSON per region."""
-    print("=== Processing Datasets ===")
-
-    print("Fetching valid instrument/region/delay/universe combinations...")
-    options_df = fetch_all_combinations(session)
-    if options_df is None or options_df.empty:
-        print("No simulation options fetched; aborting dataset fetch.")
-        return
-
-    all_datasets: list[pd.DataFrame] = []
-    combo_idx = 0
-
-    for _, row in options_df.iterrows():
-        instrument_type = row.get("InstrumentType")
-        region = row.get("Region")
-        delay = row.get("Delay")
-        universes = row.get("Universe") or []
-
-        for universe in universes:
-            combo_idx += 1
-            print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
-            try:
-                df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
-                print(f" -> {len(df)} rows")
-                all_datasets.append(df)
-            except Exception as exc:
-                print(f" -> Failed: {exc}")
-
-    if not all_datasets:
-        print("No datasets fetched; nothing to save.")
-        return
-
-    combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
-    if combined_df.empty:
-        print("No datasets fetched; nothing to save.")
-        return
-
-    regions = sorted(combined_df["param_region"].dropna().unique())
-    print(f"Found regions: {', '.join(regions)}")
-
-    for region in regions:
-        region_df = combined_df[combined_df["param_region"] == region]
-        region_unique = merge_and_deduplicate([region_df])
-
-        region_list = []
-        for _, row in region_unique.iterrows():
-            record = {col: to_jsonable(row[col]) for col in row.index}
-            region_list.append(record)
-
-        filename = f"{region.replace(' ', '_').lower()}_datasets.json"
-        filepath = os.path.join(dataset_dir, filename)
-        with open(filepath, "w", encoding="utf-8") as f:
-            json.dump(region_list, f, ensure_ascii=False, indent=2)
-
-        print(f"✓ Created {filename} with {len(region_list)} datasets")
+# Dataset fetching intentionally disabled; keep for potential re-enable.
+# def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
+#     """Fetch datasets and save one JSON per region."""
+#     print("=== Processing Datasets ===")
+#
+#     print("Fetching valid instrument/region/delay/universe combinations...")
+#     options_df = fetch_all_combinations(session)
+#     if options_df is None or options_df.empty:
+#         print("No simulation options fetched; aborting dataset fetch.")
+#         return
+#
+#     all_datasets: list[pd.DataFrame] = []
+#     combo_idx = 0
+#
+#     for _, row in options_df.iterrows():
+#         instrument_type = row.get("InstrumentType")
+#         region = row.get("Region")
+#         delay = row.get("Delay")
+#         universes = row.get("Universe") or []
+#
+#         for universe in universes:
+#             combo_idx += 1
+#             print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
+#             try:
+#                 df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
+#                 print(f" -> {len(df)} rows")
+#                 all_datasets.append(df)
+#             except Exception as exc:
+#                 print(f" -> Failed: {exc}")
+#
+#     if not all_datasets:
+#         print("No datasets fetched; nothing to save.")
+#         return
+#
+#     combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
+#     if combined_df.empty:
+#         print("No datasets fetched; nothing to save.")
+#         return
+#
+#     regions = sorted(combined_df["param_region"].dropna().unique())
+#     print(f"Found regions: {', '.join(regions)}")
+#
+#     for region in regions:
+#         region_df = combined_df[combined_df["param_region"] == region]
+#         region_unique = merge_and_deduplicate([region_df])
+#
+#         region_list = []
+#         for _, row in region_unique.iterrows():
+#             record = {col: to_jsonable(row[col]) for col in row.index}
+#             region_list.append(record)
+#
+#         filename = f"{region.replace(' ', '_').lower()}_datasets.json"
+#         filepath = os.path.join(dataset_dir, filename)
+#         with open(filepath, "w", encoding="utf-8") as f:
+#             json.dump(region_list, f, ensure_ascii=False, indent=2)
+#
+#         print(f"✓ Created {filename} with {len(region_list)} datasets")
 
 
 def process_documentation(session: ace_lib.SingleSession, knowledge_dir: str):
@@ -240,7 +242,7 @@ def main():
 
     # Ensure knowledge directory exists
     knowledge_dir = ensure_knowledge_dir()
-    dataset_dir = knowledge_dir  # Save datasets directly under knowledge
+    # dataset_dir = knowledge_dir  # Save datasets directly under knowledge (disabled)
     print(f"Knowledge directory: {knowledge_dir}\n")
 
     # Process documentation (tutorials/pages)
@@ -262,16 +264,15 @@ def main():
         traceback.print_exc()
         return
 
-    #
-    print("\nStarting dataset processing...\n")
-    try:
-        process_datasets(session, dataset_dir)
-    except Exception as exc:
-        print(f"✗ Failed to process datasets: {exc}")
-        import traceback
-        traceback.print_exc()
-        return
-
+    # Dataset processing disabled; re-enable by uncommenting the block below.
+    # print("\nStarting dataset processing...\n")
+    # try:
+    #     process_datasets(session, dataset_dir)
+    # except Exception as exc:
+    #     print(f"✗ Failed to process datasets: {exc}")
+    #     import traceback
+    #     traceback.print_exc()
+    #     return
 
     print("\n=== Processing Complete ===")
 
cnhkmcp-2.1.2/cnhkmcp/untracked/AI/桌面插件/rag_engine.py
ADDED
@@ -0,0 +1,408 @@
+import os
+import json
+import shutil
+import chromadb
+from fastembed import TextEmbedding
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler
+import threading
+
+
+PREFERRED_MODELS = [
+    "jinaai/jina-embeddings-v2-base-zh",  # handles mixed Chinese/English well, ~0.64GB
+    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",  # multilingual, ~50 languages
+    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # lightweight multilingual
+    "intfloat/multilingual-e5-large",  # stronger multilingual, ~2.2GB on disk
+]
+
+# Final chosen model will be detected at runtime from supported list
+MODEL_NAME = None
+COLLECTION_NAME = "brain_kb_v5"
+BATCH_SIZE = 128  # batch upserts to avoid huge single writes
+
+# Optional imports for different file types
+try:
+    from pypdf import PdfReader
+except ImportError:
+    PdfReader = None
+
+try:
+    from docx import Document
+except ImportError:
+    Document = None
+
+class KnowledgeBase:
+    def __init__(self, kb_path="knowledge", db_path="vector_db"):
+        self.kb_path = os.path.abspath(kb_path)
+        self.db_path = os.path.abspath(db_path)
+        self.meta_path = os.path.join(self.db_path, "_meta.json")
+        self.manifest_path = os.path.join(self.db_path, "_manifest.json")
+        self._collection_reset_guard = False
+        self._query_reset_guard = False
+        self._sync_lock = threading.Lock()
+
+        if not os.path.exists(self.kb_path):
+            os.makedirs(self.kb_path)
+
+        # Initialize Embedding Model (BAAI/bge-small-zh-v1.5 is ~100MB)
+        # This will load from cache if already downloaded
+        # Pick the first available model from the preferred list
+        _supported_raw = TextEmbedding.list_supported_models()
+        supported = set()
+        for item in _supported_raw:
+            if isinstance(item, dict) and "model" in item:
+                supported.add(item["model"])
+            elif isinstance(item, str):
+                supported.add(item)
+        chosen = None
+        for name in PREFERRED_MODELS:
+            if name in supported:
+                chosen = name
+                break
+        if not chosen:
+            raise RuntimeError(
+                "No preferred embedding models are supported by fastembed. "
+                "Please check available models via TextEmbedding.list_supported_models()."
+            )
+
+        print(f"Loading Knowledge Base Embedding Model: {chosen} (may take some time on first run)...")
+        try:
+            self.model = TextEmbedding(model_name=chosen)
+            print("Embedding Model loaded successfully.")
+        except Exception as e:
+            print(f"Error loading embedding model: {e}")
+            raise
+
+        # Store chosen model name for reference
+        global MODEL_NAME
+        MODEL_NAME = chosen
+
+        # Cache embedding dimension (detects library/model changes that corrupt existing indexes)
+        self.embed_dim = self._get_embedding_dim()
+        self.chroma_version = getattr(chromadb, "__version__", "unknown")
+
+        # If the stored index was built with a different model/dimension/chromadb version, wipe it
+        self._maybe_reset_for_incompatibility(chosen, self.embed_dim, self.chroma_version)
+
+        # Initialize Vector DB
+        self._init_collection()
+        self._healthcheck()
+
+        # Initial sync
+        self.sync_knowledge()
+
+        # Start Watcher
+        self.start_watcher()
+
+    def _init_collection(self, recreate: bool = False):
+        """(Re)initialize Chroma client/collection. If recreate=True, wipe on-disk index."""
+        if recreate and os.path.exists(self.db_path):
+            shutil.rmtree(self.db_path, ignore_errors=True)
+        try:
+            self.client = chromadb.PersistentClient(path=self.db_path)
+            self.collection = self.client.get_or_create_collection(
+                name=COLLECTION_NAME,
+                metadata={"hnsw:space": "cosine"}
+            )
+        except Exception as exc:
+            # If collection load itself fails, wipe and retry once to clear corrupted segments
+            if not recreate:
+                shutil.rmtree(self.db_path, ignore_errors=True)
+                return self._init_collection(recreate=True)
+            raise
+
+        # Persist metadata about the embedding model used to build this index
+        try:
+            os.makedirs(self.db_path, exist_ok=True)
+            with open(self.meta_path, "w", encoding="utf-8") as f:
+                json.dump({
+                    "model": MODEL_NAME,
+                    "embed_dim": self.embed_dim,
+                    "chroma_version": self.chroma_version,
+                }, f)
+        except Exception:
+            pass  # Metadata failure should not block runtime
+
+    def _healthcheck(self):
+        """Validate index readability right after startup; rebuild if corrupted."""
+        try:
+            _ = self.collection.count()
+        except Exception as e:
+            msg = str(e).lower()
+            if any(x in msg for x in ["hnsw", "segment", "compaction", "backfill"]):
+                print("Detected index corruption on startup. Rebuilding vector_db...")
+                shutil.rmtree(self.db_path, ignore_errors=True)
+                self._init_collection(recreate=True)
+                self.sync_knowledge(allow_reset=False)
+            else:
+                print(f"Index healthcheck encountered an unexpected error: {e}")
+
+    def _maybe_reset_for_incompatibility(self, chosen_model: str, embed_dim: int, chroma_version: str):
+        """If existing index meta differs (model/dimension/chromadb), wipe it."""
+        if not os.path.exists(self.db_path):
+            return
+        try:
+            with open(self.meta_path, "r", encoding="utf-8") as f:
+                meta = json.load(f)
+            prev_model = meta.get("model")
+            prev_dim = meta.get("embed_dim")
+            prev_chroma = meta.get("chroma_version")
+            if prev_model != chosen_model or prev_dim != embed_dim or prev_chroma != chroma_version:
+                shutil.rmtree(self.db_path, ignore_errors=True)
+        except Exception:
+            # If meta cannot be read, assume stale/corrupted and rebuild
+            shutil.rmtree(self.db_path, ignore_errors=True)
+
+    def _get_embedding_dim(self) -> int:
+        for vec in self.model.embed(["dimension_probe"]):
+            try:
+                return len(vec)
+            except Exception:
+                return len(list(vec))
+        raise RuntimeError("Failed to determine embedding dimension")
+
+    def sync_knowledge(self, allow_reset: bool = True):
+        """Scans the knowledge folder and updates the vector database."""
+        if not self._sync_lock.acquire(blocking=False):
+            print("Sync already running, skip this trigger.")
+            return
+
+        print("Syncing knowledge base...")
+        manifest = self._load_manifest()
+        updated_manifest = {}
+        supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+        current_files = []
+        try:
+            for filename in os.listdir(self.kb_path):
+                file_path = os.path.join(self.kb_path, filename)
+                if os.path.isfile(file_path) and filename.lower().endswith(supported_extensions):
+                    current_files.append(filename)
+                    mtime = os.path.getmtime(file_path)
+                    size = os.path.getsize(file_path)
+                    prev_meta = manifest.get(filename)
+                    # Skip unchanged files
+                    if prev_meta and prev_meta.get("mtime") == mtime and prev_meta.get("size") == size:
+                        updated_manifest[filename] = prev_meta
+                        continue
+                    try:
+                        content = self._extract_text(file_path)
+                        if content:
+                            # Sliding window chunking on original text
+                            chunk_size = 800
+                            overlap = 80
+                            original_chunks = []
+                            for i in range(0, len(content), chunk_size - overlap):
+                                chunk = content[i:i + chunk_size].strip()
+                                if chunk:
+                                    original_chunks.append(chunk)
+
+                            if original_chunks:
+                                # Normalize for embedding generation only (not for storage)
+                                normalized_chunks = [c.lower().replace('_', ' ') for c in original_chunks]
+
+                                ids = [f"{filename}_{i}" for i in range(len(original_chunks))]
+                                metadatas = [{"source": filename, "chunk": i} for i in range(len(original_chunks))]
+
+                                # Compute embeddings from normalized text
+                                embeddings = []
+                                for v in self.model.embed(normalized_chunks):
+                                    try:
+                                        embeddings.append(v.tolist())
+                                    except Exception:
+                                        embeddings.append(list(v))
+
+                                # Store ORIGINAL text (not normalized) so users see the real content
+                                for start in range(0, len(original_chunks), BATCH_SIZE):
+                                    end = start + BATCH_SIZE
+                                    self.collection.upsert(
+                                        documents=original_chunks[start:end],
+                                        ids=ids[start:end],
+                                        metadatas=metadatas[start:end],
+                                        embeddings=embeddings[start:end]
+                                    )
+                                print(f" ✓ Indexed {filename}: {len(original_chunks)} chunks (batched)")
+                        updated_manifest[filename] = {"mtime": mtime, "size": size}
+                    except Exception as e:
+                        err_msg = str(e)
+                        print(f"Error processing {filename}: {err_msg}")
+                        # Auto-recover if HNSW/compaction/index errors occur
+                        if allow_reset and any(x in err_msg.lower() for x in ["hnsw", "compaction", "segment reader"]):
+                            if not self._collection_reset_guard:
+                                print("Detected index corruption. Rebuilding vector_db and retrying sync once...")
+                                self._collection_reset_guard = True
+                                self._init_collection(recreate=True)
+                                return self.sync_knowledge(allow_reset=False)
+            # Remove deleted files from the index
+            deleted_files = set(manifest.keys()) - set(current_files)
+            for filename in deleted_files:
+                try:
+                    self.collection.delete(where={"source": filename})
+                    print(f" ✓ Removed deleted file from index: {filename}")
+                except Exception as e:
+                    print(f" ! Failed to remove {filename}: {e}")
+            # Persist manifest
+            self._save_manifest(updated_manifest)
+            print("Knowledge base sync complete.")
+        finally:
+            self._sync_lock.release()
+
+    def _extract_text(self, file_path):
+        ext = os.path.splitext(file_path)[1].lower()
+        if ext == ".txt":
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        elif ext == ".md":
+            # Treat Markdown as plain text for retrieval
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                return f.read()
+        elif ext == ".pdf":
+            if PdfReader:
+                reader = PdfReader(file_path)
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text() + "\n"
+                return text
+            else:
+                print("pypdf not installed, skipping PDF.")
+        elif ext == ".docx":
+            if Document:
+                doc = Document(file_path)
+                return "\n".join([para.text for para in doc.paragraphs])
+            else:
+                print("python-docx not installed, skipping Word.")
+        elif ext == ".json":
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+                return json.dumps(data, ensure_ascii=False, indent=2)
+        return None
+
+    def query(self, text, top_k=5, distance_threshold=0.8, allow_reset: bool = True):
+        """Retrieves relevant snippets from the knowledge base.
+
+        Uses cosine distance (lower is better). A result is treated as a hit only
+        when best_distance <= distance_threshold.
+        Returns:
+            dict: {"hit": bool, "context": str, "hits": [{source, chunk, distance, text}, ...]}
+        """
+        try:
+            # Normalize query same as indexed content
+            normalized_text = text.lower().replace('_', ' ')
+
+            q_vec = None
+            for v in self.model.embed([normalized_text]):
+                try:
+                    q_vec = v.tolist()
+                except Exception:
+                    q_vec = list(v)
+                break
+            if q_vec is None:
+                return {"hit": False, "context": "", "hits": []}
+
+            results = self.collection.query(
+                query_embeddings=[q_vec],
+                n_results=top_k,
+                include=["documents", "metadatas", "distances"]
+            )
+
+            docs = (results or {}).get("documents") or []
+            metas = (results or {}).get("metadatas") or []
+            dists = (results or {}).get("distances") or []
+
+            if not docs or not docs[0]:
+                print("[KB Query] No results returned from collection")
+                return {"hit": False, "context": "", "hits": []}
+
+            docs0 = docs[0]
+            metas0 = metas[0] if metas and metas[0] else [{} for _ in docs0]
+            dists0 = dists[0] if dists and dists[0] else [None for _ in docs0]
+
+            hits = []
+            for doc_text, meta, dist in zip(docs0, metas0, dists0):
+                hits.append({
+                    "source": (meta or {}).get("source", ""),
+                    "chunk": (meta or {}).get("chunk", None),
+                    "distance": dist,
+                    "text": doc_text,
+                })
+
+            best = hits[0].get("distance")
+            is_hit = (best is not None) and (best <= distance_threshold)
+
+            # Debug log
+            best_str = f"{best:.4f}" if best is not None else "N/A"
+            print(f"[KB Query] '{text[:50]}...' -> best_dist={best_str}, threshold={distance_threshold}, hit={is_hit}")
+            if hits:
+                top3_dists = [f"{h['distance']:.4f}" if h['distance'] is not None else "N/A" for h in hits[:3]]
+                print(f"[KB Query] Top 3 distances: {top3_dists}")
+
+            context = "\n---\n".join([h["text"] for h in hits]) if is_hit else ""
+            return {"hit": is_hit, "context": context, "hits": hits}
+        except Exception as e:
+            err_msg = str(e)
+            print(f"Query error: {err_msg}")
+            import traceback
+            traceback.print_exc()
+
+            # Auto-recover if HNSW/compaction/backfill errors surface during query
+            if allow_reset and any(x in err_msg.lower() for x in ["hnsw", "compaction", "segment reader", "backfill"]):
+                if not self._query_reset_guard:
+                    print("Detected index corruption during query. Rebuilding vector_db and retrying once...")
+                    self._query_reset_guard = True
+                    try:
+                        self._init_collection(recreate=True)
+                        self.sync_knowledge(allow_reset=False)
+                        # Retry query once with guard disabled to avoid loops
+                        self._query_reset_guard = False
+                        return self.query(text, top_k=top_k, distance_threshold=distance_threshold, allow_reset=False)
+                    except Exception as inner_e:
+                        print(f"Auto-rebuild after query failure also failed: {inner_e}")
+                        self._query_reset_guard = False
+            return {"hit": False, "context": "", "hits": []}
+
+    def start_watcher(self):
+        event_handler = KBHandler(self)
+        self.observer = Observer()
+        self.observer.schedule(event_handler, self.kb_path, recursive=False)
+        self.observer.start()
+
+    def _load_manifest(self):
+        if not os.path.exists(self.manifest_path):
+            return {}
+        try:
+            with open(self.manifest_path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except Exception:
+            return {}
+
+    def _save_manifest(self, data):
+        try:
+            os.makedirs(self.db_path, exist_ok=True)
+            with open(self.manifest_path, "w", encoding="utf-8") as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+        except Exception as e:
+            print(f" ! Failed to save manifest: {e}")
+
+class KBHandler(FileSystemEventHandler):
+    def __init__(self, kb_instance):
+        self.kb = kb_instance
+        self.supported_extensions = (".txt", ".md", ".pdf", ".docx", ".json")
+        self._debounce_timer = None
+
+    def _trigger_sync(self):
+        def run():
+            self.kb.sync_knowledge()
+        if self._debounce_timer and self._debounce_timer.is_alive():
+            return
+        self._debounce_timer = threading.Timer(0.5, run)
+        self._debounce_timer.start()
+
+    def on_modified(self, event):
+        if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+            print(f"File modified: {event.src_path}. Re-syncing...")
+            self._trigger_sync()
+
+    def on_created(self, event):
+        if not event.is_directory and event.src_path.lower().endswith(self.supported_extensions):
+            print(f"File created: {event.src_path}. Syncing...")
+            self._trigger_sync()
+
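For orientation: the class above builds a persistent Chroma collection with cosine distance, chunks each file with a sliding window (chunk_size 800, overlap 80, so consecutive chunks start 720 characters apart), and wipes/rebuilds the index whenever HNSW/segment/compaction errors or a model/dimension/chromadb mismatch is detected. A minimal usage sketch, inferred only from the signatures visible in this diff (the query string is illustrative, not from the package):

    from rag_engine import KnowledgeBase

    # Constructing the KB syncs the "knowledge" folder into "vector_db"
    # and starts a watchdog observer that re-syncs on file changes.
    kb = KnowledgeBase(kb_path="knowledge", db_path="vector_db")

    result = kb.query("how are operators categorized", top_k=5, distance_threshold=0.8)
    if result["hit"]:
        print(result["context"])  # matched chunks joined with "\n---\n"
    else:
        print("best cosine distance above threshold; no context injected")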
cnhkmcp/untracked/APP/Tranformer/Transformer.py
@@ -4729,7 +4729,7 @@ async def main():
     input_str = input()
     if input_str == "":
        config_path = os.path.join(os.path.dirname(__file__), 'transformer_config.json')
-        with open(config_path, 'r') as f:
+        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        print("\n" + "="*60)
        print("✓ 已从 transformer_config.json 加载账号配置")
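This one-line fix pins the config read to UTF-8. Without an explicit encoding, open() falls back to the platform locale encoding (e.g. cp936/GBK on Chinese Windows), so a UTF-8 transformer_config.json containing non-ASCII text can fail to decode. A small repro sketch (hypothetical file content, not from the package):

    import json

    # Write a UTF-8 config containing non-ASCII characters.
    with open("transformer_config.json", "w", encoding="utf-8") as f:
        json.dump({"note": "账号配置"}, f, ensure_ascii=False)

    # Pre-fix behavior: open(path, "r") decodes with locale.getpreferredencoding(),
    # which can raise UnicodeDecodeError on non-UTF-8 locales.
    # Post-fix behavior: always decode as UTF-8, as below.
    with open("transformer_config.json", "r", encoding="utf-8") as f:
        print(json.load(f))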