realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,172 @@
+ """
+ Graph Indexer
+ =============
+
+ Knowledge graph indexer using LightRAG.
+ """
+
+ from pathlib import Path
+ import sys
+ from typing import Dict, List, Optional
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class GraphIndexer(BaseComponent):
+     """
+     Knowledge graph indexer using LightRAG.
+
+     Builds a knowledge graph from documents for graph-based retrieval.
+     """
+
+     name = "graph_indexer"
+     _instances: Dict[str, any] = {}  # Cache RAG instances
+
+     def __init__(self, kb_base_dir: Optional[str] = None):
+         """
+         Initialize graph indexer.
+
+         Args:
+             kb_base_dir: Base directory for knowledge bases
+         """
+         super().__init__()
+         self.kb_base_dir = kb_base_dir or str(
+             Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+             / "data"
+             / "knowledge_bases"
+         )
+
+     def _get_rag_instance(self, kb_name: str):
+         """Get or create a RAGAnything instance."""
+         working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")
+
+         if working_dir in self._instances:
+             return self._instances[working_dir]
+
+         # Add RAG-Anything path
+         project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+         raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+         if raganything_path.exists() and str(raganything_path) not in sys.path:
+             sys.path.insert(0, str(raganything_path))
+
+         try:
+             from openai import AsyncOpenAI
+             from raganything import RAGAnything, RAGAnythingConfig
+
+             from src.services.embedding import get_embedding_client
+             from src.services.llm import get_llm_client
+
+             llm_client = get_llm_client()
+             embed_client = get_embedding_client()
+
+             # Create AsyncOpenAI client directly
+             openai_client = AsyncOpenAI(
+                 api_key=llm_client.config.api_key,
+                 base_url=llm_client.config.base_url,
+             )
+
+             # LLM function using services (ASYNC - LightRAG expects async functions)
+             async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+                 """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
+                 if history_messages is None:
+                     history_messages = []
+
+                 # Build messages
+                 messages = []
+                 if system_prompt:
+                     messages.append({"role": "system", "content": system_prompt})
+                 messages.extend(history_messages)
+                 messages.append({"role": "user", "content": prompt})
+
+                 # Whitelist only valid OpenAI parameters
+                 valid_params = {
+                     "temperature",
+                     "top_p",
+                     "n",
+                     "stream",
+                     "stop",
+                     "max_tokens",
+                     "presence_penalty",
+                     "frequency_penalty",
+                     "logit_bias",
+                     "user",
+                     "seed",
+                 }
+                 clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
+
+                 # Call OpenAI API directly (async)
+                 response = await openai_client.chat.completions.create(
+                     model=llm_client.config.model,
+                     messages=messages,
+                     **clean_kwargs,
+                 )
+
+                 return response.choices[0].message.content
+
+             config = RAGAnythingConfig(
+                 working_dir=working_dir,
+                 enable_image_processing=True,
+                 enable_table_processing=True,
+                 enable_equation_processing=True,
+             )
+
+             rag = RAGAnything(
+                 config=config,
+                 llm_model_func=llm_model_func,
+                 embedding_func=embed_client.get_embedding_func(),
+             )
+
+             self._instances[working_dir] = rag
+             return rag
+
+         except ImportError as e:
+             self.logger.error(f"Failed to import RAG-Anything: {e}")
+             raise
+
+     async def process(self, kb_name: str, documents: List[Document], **kwargs) -> bool:
+         """
+         Build knowledge graph from documents.
+
+         Args:
+             kb_name: Knowledge base name
+             documents: List of documents to index
+             **kwargs: Additional arguments
+
+         Returns:
+             True if successful
+         """
+         self.logger.info(f"Building knowledge graph for {kb_name}...")
+
+         from src.logging.adapters import LightRAGLogContext
+
+         # Use log forwarding context
+         with LightRAGLogContext(scene="indexer"):
+             rag = self._get_rag_instance(kb_name)
+             await rag._ensure_lightrag_initialized()
+
+             for doc in documents:
+                 if doc.content:
+                     # Write content to temporary file
+                     import os
+                     import tempfile
+
+                     tmp_path = None
+                     try:
+                         with tempfile.NamedTemporaryFile(
+                             mode="w", encoding="utf-8", suffix=".txt", delete=False
+                         ) as tmp_file:
+                             tmp_file.write(doc.content)
+                             tmp_path = tmp_file.name
+
+                         # Use RAGAnything API
+                         working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")
+                         output_dir = os.path.join(working_dir, "output")
+                         os.makedirs(output_dir, exist_ok=True)
+                         await rag.process_document_complete(tmp_path, output_dir)
+                     finally:
+                         if tmp_path and os.path.exists(tmp_path):
+                             os.unlink(tmp_path)
+
+         self.logger.info("Knowledge graph built successfully")
+         return True
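Reviewer note: a minimal sketch of how GraphIndexer would likely be driven, assuming the Document type in src/services/rag/types.py can be built from a content string and that the LLM and embedding services are already configured. The helper names below are illustrative, not part of the package.

    # Hypothetical driver (not in the package): index two documents into the
    # "demo" knowledge base via the RAGAnything-backed graph indexer.
    import asyncio

    from src.services.rag.components.indexers.graph import GraphIndexer
    from src.services.rag.types import Document  # assumed constructor: Document(content=...)

    async def build_demo_graph() -> bool:
        indexer = GraphIndexer()  # defaults to <project>/data/knowledge_bases
        docs = [
            Document(content="LightRAG builds an entity-relation graph from text."),
            Document(content="RAGAnything adds image, table and equation handling."),
        ]
        return await indexer.process("demo", docs)

    # asyncio.run(build_demo_graph())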
@@ -0,0 +1,156 @@
+ """
+ LightRAG Indexer
+ ================
+
+ Pure LightRAG indexer (text-only, no multimodal processing).
+ """
+
+ from pathlib import Path
+ import sys
+ from typing import Dict, List, Optional
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class LightRAGIndexer(BaseComponent):
+     """
+     Pure LightRAG knowledge graph indexer (text-only).
+
+     Uses LightRAG library directly without multimodal processing.
+     Faster than RAGAnything for text-only documents.
+     """
+
+     name = "lightrag_indexer"
+     _instances: Dict[str, any] = {}  # Cache LightRAG instances
+
+     def __init__(self, kb_base_dir: Optional[str] = None):
+         """
+         Initialize LightRAG indexer.
+
+         Args:
+             kb_base_dir: Base directory for knowledge bases
+         """
+         super().__init__()
+         self.kb_base_dir = kb_base_dir or str(
+             Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+             / "data"
+             / "knowledge_bases"
+         )
+
+     def _get_lightrag_instance(self, kb_name: str):
+         """Get or create a LightRAG instance (text-only)."""
+         working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")
+
+         if working_dir in self._instances:
+             return self._instances[working_dir]
+
+         # Add LightRAG path
+         project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+         raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+         if raganything_path.exists() and str(raganything_path) not in sys.path:
+             sys.path.insert(0, str(raganything_path))
+
+         try:
+             from lightrag import LightRAG
+             from openai import AsyncOpenAI
+
+             from src.services.embedding import get_embedding_client
+             from src.services.llm import get_llm_client
+
+             llm_client = get_llm_client()
+             embed_client = get_embedding_client()
+
+             # Create AsyncOpenAI client directly
+             openai_client = AsyncOpenAI(
+                 api_key=llm_client.config.api_key,
+                 base_url=llm_client.config.base_url,
+             )
+
+             # LLM function using services (ASYNC - LightRAG expects async functions)
+             async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+                 """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
+                 if history_messages is None:
+                     history_messages = []
+
+                 # Build messages
+                 messages = []
+                 if system_prompt:
+                     messages.append({"role": "system", "content": system_prompt})
+                 messages.extend(history_messages)
+                 messages.append({"role": "user", "content": prompt})
+
+                 # Whitelist only valid OpenAI parameters
+                 valid_params = {
+                     "temperature",
+                     "top_p",
+                     "n",
+                     "stream",
+                     "stop",
+                     "max_tokens",
+                     "presence_penalty",
+                     "frequency_penalty",
+                     "logit_bias",
+                     "user",
+                     "seed",
+                 }
+                 clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
+
+                 # Call OpenAI API directly (async)
+                 response = await openai_client.chat.completions.create(
+                     model=llm_client.config.model,
+                     messages=messages,
+                     **clean_kwargs,
+                 )
+
+                 return response.choices[0].message.content
+
+             # Create pure LightRAG instance (no multimodal)
+             rag = LightRAG(
+                 working_dir=working_dir,
+                 llm_model_func=llm_model_func,
+                 embedding_func=embed_client.get_embedding_func(),  # Use proper EmbeddingFunc object
+             )
+
+             self._instances[working_dir] = rag
+             return rag
+
+         except ImportError as e:
+             self.logger.error(f"Failed to import LightRAG: {e}")
+             raise
+
+     async def process(self, kb_name: str, documents: List[Document], **kwargs) -> bool:
+         """
+         Build knowledge graph from documents (text-only).
+
+         Args:
+             kb_name: Knowledge base name
+             documents: List of documents to index
+             **kwargs: Additional arguments
+
+         Returns:
+             True if successful
+         """
+         self.logger.info(f"Building knowledge graph for {kb_name} (text-only)...")
+
+         from src.logging.adapters import LightRAGLogContext
+
+         # Use log forwarding context
+         with LightRAGLogContext(scene="LightRAG-Indexer"):
+             rag = self._get_lightrag_instance(kb_name)
+
+             # Initialize storages (required for LightRAG)
+             await rag.initialize_storages()
+
+             # Initialize pipeline status (required for document processing)
+             from lightrag.kg.shared_storage import initialize_pipeline_status
+
+             await initialize_pipeline_status()
+
+             for doc in documents:
+                 if doc.content:
+                     # Use direct LightRAG insert (text-only, fast)
+                     await rag.ainsert(doc.content)
+
+         self.logger.info("Knowledge graph built successfully (text-only)")
+         return True
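Reviewer note: the two graph indexers share the same LLM plumbing and differ only in the backend call (RAGAnything's process_document_complete via a temp file vs. a direct rag.ainsert). A hedged sketch of how a caller might choose between them, assuming that the content_items populated by PDFParser is the right multimodal signal; the package itself may route differently.

    # Hypothetical selection helper (not in the package).
    from src.services.rag.components.indexers.graph import GraphIndexer
    from src.services.rag.components.indexers.lightrag import LightRAGIndexer

    def pick_graph_indexer(documents):
        # Assumption: documents carrying MinerU content_items need the
        # multimodal RAGAnything path; plain text can take the faster one.
        has_multimodal = any(getattr(doc, "content_items", None) for doc in documents)
        return GraphIndexer() if has_multimodal else LightRAGIndexer()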
@@ -0,0 +1,146 @@
+ """
+ Vector Indexer
+ ==============
+
+ Vector-based indexer using dense embeddings with FAISS.
+ Provides fast similarity search for RAG retrieval.
+ """
+
+ import json
+ from pathlib import Path
+ import pickle
+ from typing import List, Optional
+
+ import numpy as np
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class VectorIndexer(BaseComponent):
+     """
+     Vector indexer using FAISS for fast similarity search.
+
+     Creates and stores vector embeddings for efficient retrieval.
+     Falls back to simple vector storage if FAISS is not available.
+     """
+
+     name = "vector_indexer"
+
+     def __init__(self, kb_base_dir: Optional[str] = None):
+         """
+         Initialize vector indexer.
+
+         Args:
+             kb_base_dir: Base directory for knowledge bases
+         """
+         super().__init__()
+         self.kb_base_dir = kb_base_dir or str(
+             Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+             / "data"
+             / "knowledge_bases"
+         )
+
+         # Try to import FAISS, fallback to simple storage if not available
+         self.use_faiss = False
+         try:
+             import faiss
+
+             self.faiss = faiss
+             self.use_faiss = True
+             self.logger.info("Using FAISS for vector indexing")
+         except ImportError:
+             self.logger.warning("FAISS not available, using simple vector storage")
+
+     async def process(self, kb_name: str, documents: List[Document], **kwargs) -> bool:
+         """
+         Index documents using vector embeddings.
+
+         Creates FAISS index for fast similarity search or falls back to
+         simple JSON storage if FAISS is unavailable.
+
+         Args:
+             kb_name: Knowledge base name
+             documents: List of documents to index
+             **kwargs: Additional arguments
+
+         Returns:
+             True if successful
+         """
+         self.logger.info(f"Indexing {len(documents)} documents into vector store for {kb_name}")
+
+         # Collect all chunks with embeddings
+         all_chunks = []
+         for doc in documents:
+             for chunk in doc.chunks:
+                 # Check if embedding exists (handles numpy arrays and lists)
+                 if chunk.embedding is not None and len(chunk.embedding) > 0:
+                     all_chunks.append(chunk)
+
+         if not all_chunks:
+             self.logger.warning("No chunks with embeddings to index")
+             return False
+
+         self.logger.info(f"Indexing {len(all_chunks)} chunks")
+
+         # Create vector store directory
+         kb_dir = Path(self.kb_base_dir) / kb_name / "vector_store"
+         kb_dir.mkdir(parents=True, exist_ok=True)
+
+         # Convert embeddings to numpy array
+         embeddings = np.array(
+             [
+                 chunk.embedding if isinstance(chunk.embedding, list) else chunk.embedding.tolist()
+                 for chunk in all_chunks
+             ],
+             dtype=np.float32,
+         )
+
+         # Store metadata separately
+         metadata = []
+         for i, chunk in enumerate(all_chunks):
+             metadata.append(
+                 {
+                     "id": i,
+                     "content": chunk.content,
+                     "type": chunk.chunk_type,
+                     "metadata": chunk.metadata,
+                 }
+             )
+
+         # Save metadata
+         with open(kb_dir / "metadata.json", "w", encoding="utf-8") as f:
+             json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+         if self.use_faiss:
+             # Create FAISS index for inner product (cosine similarity with normalized vectors)
+             dimension = embeddings.shape[1]
+             index = self.faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
+
+             # Normalize vectors for cosine similarity (inner product of normalized vectors = cosine similarity)
+             self.faiss.normalize_L2(embeddings)
+
+             # Add vectors to index
+             index.add(embeddings)
+
+             # Save FAISS index
+             self.faiss.write_index(index, str(kb_dir / "index.faiss"))
+             self.logger.info(f"FAISS index saved with {index.ntotal} vectors")
+         else:
+             # Simple storage: save embeddings as pickle
+             with open(kb_dir / "embeddings.pkl", "wb") as f:
+                 pickle.dump(embeddings, f)
+             self.logger.info(f"Embeddings saved for {len(all_chunks)} chunks")
+
+         # Save index info
+         info = {
+             "num_chunks": len(all_chunks),
+             "num_documents": len(documents),
+             "embedding_dim": embeddings.shape[1],
+             "use_faiss": self.use_faiss,
+         }
+         with open(kb_dir / "info.json", "w", encoding="utf-8") as f:
+             json.dump(info, f, indent=2)
+
+         self.logger.info(f"Vector index saved to {kb_dir}")
+         return True
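Reviewer note: because VectorIndexer L2-normalizes the stored vectors and uses IndexFlatIP, inner-product scores equal cosine similarities only when the query vector is normalized the same way. The shipped dense retriever is presumably the real consumer of index.faiss and metadata.json; the standalone sketch below just illustrates the read side, with paths and names assumed rather than taken from the package.

    # Hypothetical read-side query against the files written above.
    import json
    from pathlib import Path

    import faiss
    import numpy as np

    def query_vector_store(kb_dir, query_embedding, top_k=5):
        store = Path(kb_dir) / "vector_store"
        index = faiss.read_index(str(store / "index.faiss"))
        metadata = json.loads((store / "metadata.json").read_text(encoding="utf-8"))

        query = np.asarray([query_embedding], dtype=np.float32)
        faiss.normalize_L2(query)  # match the normalization applied at index time
        scores, ids = index.search(query, top_k)
        return [
            (metadata[i]["content"], float(score))
            for i, score in zip(ids[0], scores[0])
            if i != -1
        ]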
@@ -0,0 +1,18 @@
+ """
+ Document Parsers
+ ================
+
+ Parsers for extracting content from various document formats.
+ """
+
+ from .base import BaseParser
+ from .markdown import MarkdownParser
+ from .pdf import PDFParser
+ from .text import TextParser
+
+ __all__ = [
+     "BaseParser",
+     "PDFParser",
+     "MarkdownParser",
+     "TextParser",
+ ]
@@ -0,0 +1,35 @@
+ """
+ Base Parser
+ ===========
+
+ Base class for document parsers.
+ """
+
+ from pathlib import Path
+ from typing import Union
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class BaseParser(BaseComponent):
+     """
+     Base class for document parsers.
+
+     Parsers convert raw files into Document objects.
+     """
+
+     name = "base_parser"
+
+     async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
+         """
+         Parse a file into a Document.
+
+         Args:
+             file_path: Path to the file to parse
+             **kwargs: Additional arguments
+
+         Returns:
+             Parsed Document
+         """
+         raise NotImplementedError("Subclasses must implement process()")
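Reviewer note: BaseParser is the extension point for new formats; a subclass only needs to implement the async process() hook and return a Document, mirroring the constructor usage in the bundled parsers. A hypothetical CSV parser, not shipped in the package, as a sketch:

    # Hypothetical subclass (not in the package) showing the extension point.
    from pathlib import Path
    from typing import Union

    from src.services.rag.components.parsers.base import BaseParser
    from src.services.rag.types import Document

    class CSVParser(BaseParser):
        name = "csv_parser"

        async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
            file_path = Path(file_path)
            if not file_path.exists():
                raise FileNotFoundError(f"CSV file not found: {file_path}")
            content = file_path.read_text(encoding="utf-8")
            return Document(
                content=content,
                file_path=str(file_path),
                metadata={"filename": file_path.name, "parser": self.name},
            )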
@@ -0,0 +1,52 @@
+ """
+ Markdown Parser
+ ===============
+
+ Parser for Markdown documents.
+ """
+
+ from pathlib import Path
+ from typing import Union
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class MarkdownParser(BaseComponent):
+     """
+     Markdown parser.
+
+     Parses Markdown files into Document objects.
+     """
+
+     name = "markdown_parser"
+
+     async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
+         """
+         Parse a Markdown file into a Document.
+
+         Args:
+             file_path: Path to the Markdown file
+             **kwargs: Additional arguments
+
+         Returns:
+             Parsed Document
+         """
+         file_path = Path(file_path)
+
+         if not file_path.exists():
+             raise FileNotFoundError(f"Markdown file not found: {file_path}")
+
+         self.logger.info(f"Parsing Markdown: {file_path.name}")
+
+         with open(file_path, "r", encoding="utf-8") as f:
+             content = f.read()
+
+         return Document(
+             content=content,
+             file_path=str(file_path),
+             metadata={
+                 "filename": file_path.name,
+                 "parser": self.name,
+             },
+         )
@@ -0,0 +1,115 @@
+ """
+ PDF Parser
+ ==========
+
+ Parser for PDF documents using MinerU/RAG-Anything.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Optional, Union
+
+ from ...types import Document
+ from ..base import BaseComponent
+
+
+ class PDFParser(BaseComponent):
+     """
+     PDF parser using MinerU for extraction.
+
+     Can use RAG-Anything's MinerU integration or standalone MinerU.
+     """
+
+     name = "pdf_parser"
+
+     def __init__(self, use_mineru: bool = True, output_dir: Optional[str] = None):
+         """
+         Initialize PDF parser.
+
+         Args:
+             use_mineru: Whether to use MinerU for parsing
+             output_dir: Directory to store parsed output
+         """
+         super().__init__()
+         self.use_mineru = use_mineru
+         self.output_dir = output_dir
+
+     async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
+         """
+         Parse a PDF file into a Document.
+
+         Args:
+             file_path: Path to the PDF file
+             **kwargs: Additional arguments
+
+         Returns:
+             Parsed Document with content and content_items
+         """
+         file_path = Path(file_path)
+
+         if not file_path.exists():
+             raise FileNotFoundError(f"PDF file not found: {file_path}")
+
+         self.logger.info(f"Parsing PDF: {file_path.name}")
+
+         # Check for existing parsed content
+         output_dir = Path(kwargs.get("output_dir", self.output_dir or file_path.parent))
+         content_list_file = output_dir / f"{file_path.stem}.json"
+
+         content_items = []
+         content = ""
+
+         if content_list_file.exists():
+             # Load existing parsed content
+             self.logger.info(f"Loading existing parsed content from {content_list_file}")
+             with open(content_list_file, "r", encoding="utf-8") as f:
+                 content_items = json.load(f)
+
+             # Extract text content
+             content = self._extract_text_from_content_items(content_items)
+         else:
+             # Parse PDF (placeholder - actual MinerU parsing would happen here)
+             self.logger.warning(
+                 "No pre-parsed content found. Use RAGAnythingPipeline for full PDF parsing."
+             )
+             # Basic text extraction fallback
+             content = await self._basic_pdf_extract(file_path)
+
+         return Document(
+             content=content,
+             file_path=str(file_path),
+             content_items=content_items,
+             metadata={
+                 "filename": file_path.name,
+                 "parser": self.name,
+             },
+         )
+
+     def _extract_text_from_content_items(self, content_items: list) -> str:
+         """Extract plain text from MinerU content items."""
+         texts = []
+         for item in content_items:
+             if isinstance(item, dict):
+                 if "text" in item:
+                     texts.append(item["text"])
+                 elif "content" in item:
+                     texts.append(item["content"])
+         return "\n\n".join(texts)
+
+     async def _basic_pdf_extract(self, file_path: Path) -> str:
+         """Basic PDF text extraction fallback."""
+         try:
+             import fitz  # PyMuPDF
+
+             doc = fitz.open(file_path)
+             texts = []
+             for page in doc:
+                 texts.append(page.get_text())
+             doc.close()
+             return "\n\n".join(texts)
+         except ImportError:
+             self.logger.warning("PyMuPDF not installed. Cannot extract PDF text.")
+             return ""
+         except Exception as e:
+             self.logger.error(f"Failed to extract PDF text: {e}")
+             return ""
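Reviewer note: taken together, the parsers and indexers form a two-step ingest path: a parser turns a file into a Document, and an indexer consumes the Document's content. A hedged end-to-end sketch wiring MarkdownParser into the text-only LightRAG indexer follows; the knowledge-base name and file path are placeholders, and the wrapper function is illustrative rather than part of the package.

    # Hypothetical ingest flow (illustrative only).
    import asyncio

    from src.services.rag.components.indexers.lightrag import LightRAGIndexer
    from src.services.rag.components.parsers.markdown import MarkdownParser

    async def ingest_markdown(kb_name: str, md_path: str) -> bool:
        doc = await MarkdownParser().process(md_path)
        return await LightRAGIndexer().process(kb_name, [doc])

    # asyncio.run(ingest_markdown("demo", "notes/intro.md"))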