realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,621 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Knowledge Base Initialization Script
5
+
6
+ This script initializes a new knowledge base from given documents:
7
+ 1. Creates directory structure
8
+ 2. Processes documents using RAG-Anything
9
+ 3. Builds knowledge graph database
10
+ 4. Extracts images and content lists
11
+ """
12
+
13
+ import argparse
14
+ import asyncio
15
+ from datetime import datetime
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+ import shutil
20
+
21
+ from src.logging import get_logger
22
+ from src.services.embedding import get_embedding_config
23
+ from src.services.llm import get_llm_config
24
+ from src.services.rag.service import RAGService
25
+
26
+ logger = get_logger("KnowledgeInit")
27
+
28
+ # Import numbered items extraction functionality
29
+ from src.knowledge.extract_numbered_items import process_content_list
30
+ from src.knowledge.progress_tracker import ProgressStage, ProgressTracker
31
+
32
+
33
class KnowledgeBaseInitializer:
    """Create and populate a knowledge base directory from source documents.

    The on-disk layout managed by this class is::

        <base_dir>/kb_config.json          registry of known knowledge bases
        <base_dir>/<kb_name>/metadata.json
        <base_dir>/<kb_name>/raw/          copies of the source documents
        <base_dir>/<kb_name>/images/       images collected by fix_structure()
        <base_dir>/<kb_name>/rag_storage/  RAG provider storage
        <base_dir>/<kb_name>/content_list/ flattened ``<doc>.json`` content lists

    Typical call order: ``create_directory_structure()``, ``copy_documents()``,
    ``await process_documents()``, then optionally ``extract_numbered_items()``.
    """

    def __init__(
        self,
        kb_name: str,
        base_dir="./data/knowledge_bases",
        api_key: str | None = None,
        base_url: str | None = None,
        progress_tracker: ProgressTracker | None = None,
        rag_provider: str | None = None,
    ):
        """Compute all paths for the knowledge base; nothing is created on disk here.

        Args:
            kb_name: Name of the knowledge base; also used as its directory name.
            base_dir: Directory that holds all knowledge bases.
            api_key: Optional API key override used by extract_numbered_items().
            base_url: Optional API base URL override used by extract_numbered_items().
            progress_tracker: Tracker that receives progress updates. When omitted,
                a ``ProgressTracker(kb_name, base_dir)`` is created.
            rag_provider: RAG provider name. When omitted, process_documents()
                falls back to the ``RAG_PROVIDER`` environment variable.
        """
        self.kb_name = kb_name
        self.base_dir = Path(base_dir)
        self.kb_dir = self.base_dir / kb_name

        # Directory structure
        self.raw_dir = self.kb_dir / "raw"
        self.images_dir = self.kb_dir / "images"
        self.rag_storage_dir = self.kb_dir / "rag_storage"
        self.content_list_dir = self.kb_dir / "content_list"

        self.api_key = api_key
        self.base_url = base_url
        self.embedding_cfg = get_embedding_config()
        self.progress_tracker = progress_tracker or ProgressTracker(kb_name, self.base_dir)
        self.rag_provider = rag_provider

    def _register_to_config(self):
        """Register KB to kb_config.json (only knowledge_bases list, no default).

        An unreadable or structurally invalid config is replaced by a fresh one
        instead of raising. The file is only rewritten when this knowledge base
        was not registered yet.
        """
        config_file = self.base_dir / "kb_config.json"
        config = {"knowledge_bases": {}}
        if config_file.exists():
            try:
                with open(config_file, encoding="utf-8") as f:
                    config = json.load(f)
            except Exception as e:
                logger.warning(f"Failed to read config: {e}, creating new")
                config = {"knowledge_bases": {}}

        # Valid JSON is not necessarily a JSON object; anything else cannot hold
        # the registry and would raise TypeError on the item assignments below.
        if not isinstance(config, dict):
            logger.warning("kb_config.json does not contain a JSON object, creating new")
            config = {"knowledge_bases": {}}

        if not isinstance(config.get("knowledge_bases"), dict):
            if "knowledge_bases" in config:
                logger.warning("'knowledge_bases' in kb_config.json is not a mapping, resetting")
            config["knowledge_bases"] = {}

        # Remove old "default" field if exists (migration)
        config.pop("default", None)

        if self.kb_name in config["knowledge_bases"]:
            logger.info(" ✓ Already registered in kb_config.json")
            return

        config["knowledge_bases"][self.kb_name] = {
            "path": self.kb_name,
            "description": f"Knowledge base: {self.kb_name}",
        }

        try:
            with open(config_file, "w", encoding="utf-8") as f:
                json.dump(config, indent=2, ensure_ascii=False, fp=f)
            logger.info(" ✓ Registered to kb_config.json")
        except Exception as e:
            logger.warning(f"Failed to update config: {e}")

    def _update_metadata_with_provider(self, provider: str):
        """Update metadata.json and centralized config with the RAG provider used.

        Best effort: every failure is logged as a warning and never raised.

        Args:
            provider: Name of the RAG provider that processed the documents.
        """
        metadata_file = self.kb_dir / "metadata.json"
        try:
            if metadata_file.exists():
                with open(metadata_file, encoding="utf-8") as f:
                    metadata = json.load(f)
            else:
                metadata = {}

            metadata["rag_provider"] = provider
            metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            with open(metadata_file, "w", encoding="utf-8") as f:
                json.dump(metadata, indent=2, ensure_ascii=False, fp=f)

            logger.info(f" ✓ Updated metadata with RAG provider: {provider}")

            # Also save to centralized config file
            try:
                from src.services.config import get_kb_config_service

                kb_config_service = get_kb_config_service()
                kb_config_service.set_rag_provider(self.kb_name, provider)
                logger.info(" ✓ Saved RAG provider to centralized config")
            except Exception as config_err:
                logger.warning(f"Failed to save to centralized config: {config_err}")

        except Exception as e:
            logger.warning(f"Failed to update metadata with provider: {e}")

    def create_directory_structure(self):
        """Create knowledge base directory structure.

        Creates the four working directories, writes a fresh ``metadata.json``
        (overwriting an existing one) and registers the knowledge base in
        ``kb_config.json``.
        """
        logger.info(f"Creating directory structure for knowledge base: {self.kb_name}")

        for dir_path in (
            self.raw_dir,
            self.images_dir,
            self.rag_storage_dir,
            self.content_list_dir,
        ):
            dir_path.mkdir(parents=True, exist_ok=True)
            logger.info(f" ✓ Created: {dir_path}")

        # Create metadata file
        metadata = {
            "name": self.kb_name,
            "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "description": f"Knowledge base: {self.kb_name}",
            "version": "1.0",
            "rag_provider": None,  # Will be set during document processing
        }

        metadata_file = self.kb_dir / "metadata.json"
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, indent=2, ensure_ascii=False, fp=f)

        logger.info(f" ✓ Created metadata file: {metadata_file}")

        # Automatically register to kb_config.json
        self._register_to_config()

    def copy_documents(self, source_files: list[str]):
        """Copy documents to raw directory.

        Paths that do not exist or are not regular files are skipped with a
        warning instead of aborting the whole batch. A source that already is
        the destination file inside ``raw/`` is treated as copied.

        Args:
            source_files: Paths of the documents to copy.

        Returns:
            List of destination paths (as strings) now present in ``raw/``.
        """
        logger.info(f"Copying {len(source_files)} documents to {self.raw_dir}")

        # The raw directory is missing when create_directory_structure() has not run.
        self.raw_dir.mkdir(parents=True, exist_ok=True)

        copied_files = []
        for source in source_files:
            source_path = Path(source)
            if not source_path.exists():
                logger.warning(f" ⚠ Source file not found: {source}")
                continue
            if not source_path.is_file():
                # shutil.copy2 cannot copy directories; skip rather than crash.
                logger.warning(f" ⚠ Not a regular file, skipping: {source}")
                continue

            dest_path = self.raw_dir / source_path.name
            try:
                shutil.copy2(source_path, dest_path)
            except shutil.SameFileError:
                # Source already lives in raw/; nothing to copy.
                pass
            copied_files.append(str(dest_path))
            logger.info(f" ✓ Copied: {source_path.name}")

        return copied_files

    async def process_documents(self):
        """Process documents using RAGService with dynamic provider selection.

        Collects ``*.pdf``, ``*.docx``, ``*.doc``, ``*.txt`` and ``*.md`` files
        from ``raw/`` and hands them to ``RAGService.initialize``. Errors are
        logged and reported to the progress tracker, never raised. Whenever at
        least one document was found, ``fix_structure()`` and
        ``display_statistics_generic()`` run afterwards, even after a failure.
        """
        # Use the provider passed during initialization, or fallback to env var
        provider = self.rag_provider or os.getenv("RAG_PROVIDER", "lightrag")
        logger.info(f"Processing documents with RAG provider: {provider}")

        self.progress_tracker.update(
            ProgressStage.PROCESSING_DOCUMENTS,
            f"Starting to process documents with {provider} provider...",
            current=0,
            total=0,
        )

        # Get all documents in raw directory
        doc_files = []
        for ext in ["*.pdf", "*.docx", "*.doc", "*.txt", "*.md"]:
            doc_files.extend(self.raw_dir.glob(ext))

        if not doc_files:
            logger.warning("No documents found to process")
            self.progress_tracker.update(
                ProgressStage.ERROR, "No documents found to process", error="No documents found"
            )
            return

        logger.info(f"Found {len(doc_files)} document(s) to process")
        self.progress_tracker.update(
            ProgressStage.PROCESSING_DOCUMENTS,
            f"Found {len(doc_files)} documents, starting to process...",
            current=0,
            total=len(doc_files),
        )

        # Initialize RAGService with the selected provider
        rag_service = RAGService(
            kb_base_dir=str(self.base_dir),  # Base directory for all KBs (e.g., data/knowledge_bases)
            provider=provider,
        )

        # Convert Path objects to strings for file paths
        file_paths = [str(doc_file) for doc_file in doc_files]

        try:
            # Process all documents using the RAGService
            success = await rag_service.initialize(
                kb_name=self.kb_name,
                file_paths=file_paths,
                extract_numbered_items=True,  # Enable numbered items extraction
            )

            if success:
                logger.info("✓ Document processing completed!")

                # Update metadata with the RAG provider used
                self._update_metadata_with_provider(provider)

                self.progress_tracker.update(
                    ProgressStage.PROCESSING_DOCUMENTS,
                    "Documents processed successfully",
                    current=len(doc_files),
                    total=len(doc_files),
                )
            else:
                logger.error("Document processing failed")
                self.progress_tracker.update(
                    ProgressStage.ERROR,
                    "Document processing failed",
                    error="RAG pipeline returned failure",
                )

        except asyncio.TimeoutError:
            # NOTE(review): no timeout is set in this method; the 10-minute figure
            # presumably matches a timeout applied inside the RAG pipeline - confirm.
            error_msg = "Processing timeout (>10 minutes)"
            logger.error("✗ Timeout processing documents")
            logger.error("Possible causes: Large files, slow embedding API, network issues")
            self.progress_tracker.update(
                ProgressStage.ERROR,
                "Timeout processing documents",
                error=error_msg,
            )
        except Exception as e:
            error_msg = str(e)
            logger.error(f"✗ Error processing documents: {error_msg}")
            import traceback

            logger.error(traceback.format_exc())
            self.progress_tracker.update(
                ProgressStage.ERROR,
                "Failed to process documents",
                error=error_msg,
            )

        # Fix structure: flatten nested content_list directories (for RAGAnything compatibility)
        await self.fix_structure()

        # Display statistics
        await self.display_statistics_generic()

    async def fix_structure(self):
        """
        Fix the nested structure created by process_document_complete.
        Flattens content_list directories and moves images to the correct location.

        For every ``content_list/<doc>/auto/`` directory, ``*_content_list.json``
        is copied to ``content_list/<doc>.json`` and files in ``auto/images/``
        are copied to the knowledge base ``images/`` directory (existing image
        names are not overwritten). Nested directories are then removed, except
        those for which a copy failed: they are kept so no output is lost.
        """
        logger.info("\nFixing directory structure...")

        nested_dirs = [entry for entry in self.content_list_dir.glob("*") if entry.is_dir()]

        # Names of nested directories with at least one failed copy; these must
        # survive the clean-up step because they still hold the only copy.
        failed_dirs = set()

        # Move content list files
        for doc_dir in nested_dirs:
            auto_dir = doc_dir / "auto"
            if not auto_dir.exists():
                continue

            # Find the _content_list.json file
            for source in auto_dir.glob("*_content_list.json"):
                target = self.content_list_dir / f"{doc_dir.name}.json"
                try:
                    shutil.copy2(source, target)
                    logger.info(f" ✓ Moved: {source.name} -> {target.name}")
                except Exception as e:
                    failed_dirs.add(doc_dir.name)
                    logger.error(f" ✗ Error moving {source.name}: {e!s}")

        # Find and move nested images
        for doc_dir in nested_dirs:
            images_dir = doc_dir / "auto" / "images"
            if not images_dir.is_dir():
                continue

            image_count = 0
            # Ensure target directory exists
            self.images_dir.mkdir(parents=True, exist_ok=True)

            for img_file in images_dir.glob("*"):
                if not img_file.is_file():
                    continue
                target_img = self.images_dir / img_file.name
                if target_img.exists():
                    # An image with this name was collected earlier; keep it.
                    continue
                try:
                    shutil.copy2(img_file, target_img)
                    image_count += 1
                except FileNotFoundError:
                    failed_dirs.add(doc_dir.name)
                    logger.error(
                        f" ✗ Error moving image {img_file.name}: Source file not found: {img_file}"
                    )
                except Exception as e:
                    failed_dirs.add(doc_dir.name)
                    logger.error(f" ✗ Error moving image {img_file.name}: {e!s}")

            if image_count > 0:
                logger.info(f" ✓ Moved {image_count} images from {doc_dir.name}/auto/images/")

        # Clean up nested directories
        for doc_dir in nested_dirs:
            if doc_dir.name in failed_dirs:
                logger.warning(
                    f" ⚠ Keeping {doc_dir.name}/ because some files could not be copied"
                )
                continue
            try:
                shutil.rmtree(doc_dir)
                logger.info(f" ✓ Cleaned up: {doc_dir.name}/")
            except Exception as e:
                logger.error(f" ✗ Error removing {doc_dir.name}: {e!s}")

        logger.info("✓ Structure fixed!")

    def extract_numbered_items(self, batch_size: int = 20):
        """
        Extract numbered items from knowledge base (Definition, Proposition, Equation, Figure, etc.)

        Every ``content_list/*.json`` file is passed to ``process_content_list``
        in sorted order; results accumulate in ``numbered_items.json``. Errors
        are logged and reported to the progress tracker, never raised.

        Args:
            batch_size: Number of items to process per batch
        """
        logger.info("\n" + "=" * 60)
        logger.info("🔍 Starting to extract numbered items...")
        logger.info("=" * 60 + "\n")

        self.progress_tracker.update(
            ProgressStage.EXTRACTING_ITEMS,
            "Starting to extract numbered items...",
            current=0,
            total=0,
        )

        # Get LLM config for credentials
        llm_cfg = get_llm_config()
        api_key = self.api_key or llm_cfg.api_key
        base_url = self.base_url or llm_cfg.base_url

        output_file = self.kb_dir / "numbered_items.json"
        content_list_files = sorted(self.content_list_dir.glob("*.json"))

        if not content_list_files:
            logger.warning("No content_list files found, skipping numbered items extraction")
            return

        file_count = len(content_list_files)
        logger.info(f"Found {file_count} content_list files")
        self.progress_tracker.update(
            ProgressStage.EXTRACTING_ITEMS,
            f"Found {file_count} files, starting extraction...",
            current=0,
            total=file_count,
        )

        try:
            # Process all content_list files
            for idx, content_list_file in enumerate(content_list_files, 1):
                logger.info(f"\nProcessing file [{idx}/{file_count}]: {content_list_file.name}")
                self.progress_tracker.update(
                    ProgressStage.EXTRACTING_ITEMS,
                    f"Extracting: {content_list_file.name}",
                    current=idx,
                    total=file_count,
                    file_name=content_list_file.name,
                )

                # First file doesn't merge (creates new file), subsequent files merge into existing results
                merge = idx > 1

                process_content_list(
                    content_list_file=content_list_file,
                    output_file=output_file,
                    api_key=api_key,
                    base_url=base_url,
                    batch_size=batch_size,
                    merge=merge,
                )

            logger.info(f"\n{'=' * 60}")
            logger.info("✓ Numbered items extraction completed!")
            logger.info(f"Output file: {output_file}")
            logger.info(f"{'=' * 60}\n")

            self.progress_tracker.update(
                ProgressStage.COMPLETED,
                "Knowledge base initialization completed!",
                current=file_count,
                total=file_count,
            )

        except Exception as e:
            error_msg = str(e)
            logger.error(f"\n✗ Numbered items extraction failed: {error_msg}")
            import traceback

            # Route the traceback through the logger (as process_documents does)
            # so it reaches the configured log handlers, not just stderr.
            logger.error(traceback.format_exc())
            self.progress_tracker.update(
                ProgressStage.ERROR, "Numbered items extraction failed", error=error_msg
            )

    async def display_statistics(self, rag):
        """Display knowledge base statistics (legacy - for RAGAnything).

        Args:
            rag: Unused; kept so existing callers keep working.
        """
        await self.display_statistics_generic()

    async def display_statistics_generic(self):
        """Display knowledge base statistics (provider-agnostic).

        Logs file counts for ``raw/``, ``images/`` and ``content_list/``, plus
        entity, relation, chunk and vector-store figures for whichever storage
        files exist. Read failures are logged as warnings and never raised.
        """
        logger.info("\n" + "=" * 50)
        logger.info("Knowledge Base Statistics")
        logger.info("=" * 50)

        # Count files
        raw_files = list(self.raw_dir.glob("*"))
        image_files = list(self.images_dir.glob("*"))
        content_files = list(self.content_list_dir.glob("*.json"))

        logger.info(f"Raw documents: {len(raw_files)}")
        logger.info(f"Extracted images: {len(image_files)}")
        logger.info(f"Content lists: {len(content_files)}")

        # Start from the configured provider; metadata.json overrides it below
        provider = self.rag_provider or os.getenv("RAG_PROVIDER", "lightrag")

        # Try to read from metadata.json if available
        kb_metadata_file = self.kb_dir / "metadata.json"
        if kb_metadata_file.exists():
            try:
                with open(kb_metadata_file, encoding="utf-8") as f:
                    kb_metadata = json.load(f)
                if "rag_provider" in kb_metadata and kb_metadata["rag_provider"]:
                    provider = kb_metadata["rag_provider"]
            except Exception as e:
                # Statistics are informational only: keep the fallback provider,
                # but do not hide the problem.
                logger.warning(f"Could not read provider from metadata: {e!s}")

        # RAGAnything/LightRAG format
        entities_file = self.rag_storage_dir / "kv_store_full_entities.json"
        relations_file = self.rag_storage_dir / "kv_store_full_relations.json"
        chunks_file = self.rag_storage_dir / "kv_store_text_chunks.json"

        # LlamaIndex format
        vector_store_dir = self.base_dir / self.kb_name / "vector_store"

        try:
            if entities_file.exists():
                with open(entities_file, encoding="utf-8") as f:
                    entities = json.load(f)
                logger.info(f"Knowledge entities: {len(entities)}")

            if relations_file.exists():
                with open(relations_file, encoding="utf-8") as f:
                    relations = json.load(f)
                logger.info(f"Knowledge relations: {len(relations)}")

            if chunks_file.exists():
                with open(chunks_file, encoding="utf-8") as f:
                    chunks = json.load(f)
                logger.info(f"Text chunks: {len(chunks)}")

            if vector_store_dir.exists():
                vector_metadata_file = vector_store_dir / "metadata.json"
                if vector_metadata_file.exists():
                    with open(vector_metadata_file, encoding="utf-8") as f:
                        vector_metadata = json.load(f)
                    logger.info(f"Vector embeddings: {vector_metadata.get('num_embeddings', 0)}")
                    logger.info(f"Embedding dimension: {vector_metadata.get('dimension', 0)}")
        except Exception as e:
            logger.warning(f"Could not retrieve statistics: {e!s}")

        logger.info(f"Provider used: {provider}")
        logger.info("=" * 50)
507
+
508
+
509
async def main():
    """Command-line entry point: build a new knowledge base from documents.

    Parses the CLI arguments, validates that an API key and at least one
    document are available (unless ``--skip-processing`` is given), creates
    the directory structure, copies the documents, processes them and, unless
    disabled, runs numbered-item extraction. Validation failures are logged
    and end the coroutine early with ``None``.
    """
    # NOTE(review): the leading whitespace of the epilog lines is not
    # recoverable from the reviewed text; confirm against the original file.
    usage_examples = """
Example usage:
# Initialize new knowledge base from documents (with auto extraction)
python init_knowledge_base.py my_kb --docs document1.pdf document2.pdf

# Initialize from a directory
python init_knowledge_base.py my_kb --docs-dir ./my_documents/

# Initialize without numbered items extraction
python init_knowledge_base.py my_kb --docs document.pdf --skip-extract

# Adjust batch size for extraction (for large knowledge bases)
python init_knowledge_base.py my_kb --docs document.pdf --batch-size 30
"""
    parser = argparse.ArgumentParser(
        description="Initialize a new knowledge base from documents",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword options) for every CLI argument, in registration order.
    cli_options = (
        (("name",), {"help": "Knowledge base name"}),
        (("--docs",), {"nargs": "+", "help": "Document files to process"}),
        (("--docs-dir",), {"help": "Directory containing documents to process"}),
        (
            ("--base-dir",),
            {
                "default": "./knowledge_bases",
                "help": "Base directory for knowledge bases (default: ./knowledge_bases)",
            },
        ),
        (("--api-key",), {"default": os.getenv("LLM_API_KEY"), "help": "OpenAI API key"}),
        (("--base-url",), {"default": os.getenv("LLM_HOST"), "help": "API base URL"}),
        (
            ("--skip-processing",),
            {
                "action": "store_true",
                "help": "Skip document processing (only create structure)",
            },
        ),
        (
            ("--skip-extract",),
            {
                "action": "store_true",
                "help": "Skip numbered items extraction after initialization",
            },
        ),
        (
            ("--batch-size",),
            {
                "type": int,
                "default": 20,
                "help": "Batch size for numbered items extraction (default: 20)",
            },
        ),
    )
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    processing_enabled = not args.skip_processing

    # Processing needs credentials; structure-only runs do not.
    if processing_enabled and not args.api_key:
        logger.error("Error: OpenAI API key required")
        logger.error("Set LLM_API_KEY environment variable or use --api-key option")
        return

    # Gather documents: explicit files first, then matches from the directory.
    doc_files = list(args.docs or [])

    if args.docs_dir:
        source_dir = Path(args.docs_dir)
        if not source_dir.exists() or not source_dir.is_dir():
            logger.error(f"Error: Documents directory not found: {args.docs_dir}")
            return
        for pattern in ("*.pdf", "*.docx", "*.doc", "*.txt", "*.md"):
            doc_files += [str(match) for match in source_dir.glob(pattern)]

    if processing_enabled and not doc_files:
        logger.error("Error: No documents specified")
        logger.error("Use --docs or --docs-dir to specify documents")
        return

    rule = "=" * 60

    logger.info(f"\n{rule}")
    logger.info(f"Initializing Knowledge Base: {args.name}")
    logger.info(f"{rule}\n")

    initializer = KnowledgeBaseInitializer(
        kb_name=args.name,
        base_dir=args.base_dir,
        api_key=args.api_key,
        base_url=args.base_url,
    )

    initializer.create_directory_structure()

    if doc_files:
        copied = initializer.copy_documents(doc_files)
        logger.info(f"\nCopied {len(copied)} file(s) to raw directory")

    if args.skip_processing:
        logger.info("\nSkipping document processing (--skip-processing specified)")
    else:
        await initializer.process_documents()

    # Extraction only makes sense after processing; it is silently omitted
    # when processing was skipped and --skip-extract was not given.
    if args.skip_extract:
        logger.info("\nSkipping numbered items extraction (--skip-extract specified)")
    elif processing_enabled:
        initializer.extract_numbered_items(batch_size=args.batch_size)

    logger.info(f"\n{rule}")
    logger.info(f"✓ Knowledge base '{args.name}' initialized successfully!")
    logger.info(f"Location: {initializer.kb_dir}")
    logger.info(f"{rule}\n")
617
+
618
+
619
if __name__ == "__main__":
    # Logging was already configured when this module was imported, so the
    # script only has to drive the async entry point to completion.
    entry_point = main()
    asyncio.run(entry_point)
src/knowledge/kb.py ADDED
@@ -0,0 +1,22 @@
1
#!/usr/bin/env python
"""
Knowledge Base Management Tool - Standalone Entry Script

Can be run directly: python src/knowledge/kb.py [command]

Re-exports ``KnowledgeBaseManager`` under the name ``KnowledgeBase`` and, when
executed as a script, delegates to ``main`` from ``src.knowledge.start_kb``.
"""

from pathlib import Path
import sys

# Ensure the project root (three levels above this file) is on sys.path so the
# absolute ``src.*`` imports below work when this file is run as a script.
# The path is resolved first: with a relative __file__ such as "kb.py" every
# .parent step would collapse to "." and the current directory would be added
# instead of the project root.
project_root = Path(__file__).resolve().parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import main function from startup script
from src.knowledge.manager import KnowledgeBaseManager as KnowledgeBase
from src.knowledge.start_kb import main

__all__ = ["KnowledgeBase"]

if __name__ == "__main__":
    main()