realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,286 @@
1
+ """
2
+ File Type Router
3
+ ================
4
+
5
+ Centralized file type classification and routing for RAG pipelines.
6
+ Determines the appropriate processing method for each document type.
7
+ """
8
+
9
+ from dataclasses import dataclass
10
+ from enum import Enum
11
+ from pathlib import Path
12
+ from typing import List
13
+
14
+ from src.logging import get_logger
15
+
16
+ logger = get_logger("FileTypeRouter")
17
+
18
+
19
class DocumentType(Enum):
    """Coarse document categories used to choose a processing route."""

    # Requires MinerU complex parsing.
    PDF = "pdf"
    # Plain text that can be read directly.
    TEXT = "text"
    # Structured text.
    MARKDOWN = "markdown"
    # Word documents.
    DOCX = "docx"
    # Images (may need OCR).
    IMAGE = "image"
    # Unsupported / could not be classified.
    UNKNOWN = "unknown"
28
+
29
+
30
@dataclass
class FileClassification:
    """
    Files grouped by the processing route selected for them.

    Attributes:
        needs_mineru: Paths that require MinerU parsing (PDF, etc.).
        text_files: Paths whose content can be read directly as text.
        unsupported: Paths whose format is not supported.
    """

    needs_mineru: List[str]
    text_files: List[str]
    unsupported: List[str]
44
+
45
+
46
class FileTypeRouter:
    """
    File type router for RAG pipelines.

    Classifies files before processing to route them to appropriate handlers:
    - PDF, Word and image files -> MinerU parser (complex document parsing)
    - Text files -> Direct read (fast, simple)
    - Everything else -> reported as unsupported so the caller can skip it

    Usage:
        router = FileTypeRouter()
        classification = router.classify_files(file_paths)

        # Process PDF files with MinerU
        for pdf in classification.needs_mineru:
            await rag.process_document_complete(pdf, ...)

        # Process text files directly
        for txt in classification.text_files:
            content = await FileTypeRouter.read_text_file(txt)
            await rag.lightrag.ainsert(content)
    """

    # Extensions requiring MinerU parsing (complex document formats)
    MINERU_EXTENSIONS = {".pdf"}

    # Extensions for direct text reading
    TEXT_EXTENSIONS = {
        # Plain text
        ".txt",
        ".text",
        ".log",
        # Markup languages
        ".md",
        ".markdown",
        ".rst",
        ".asciidoc",
        # Data formats
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".csv",
        ".tsv",
        # LaTeX
        ".tex",
        ".latex",
        ".bib",
        # Code files
        ".py",
        ".js",
        ".ts",
        ".jsx",
        ".tsx",
        ".java",
        ".c",
        ".cpp",
        ".h",
        ".hpp",
        ".go",
        ".rs",
        ".rb",
        ".php",
        ".swift",
        ".kt",
        ".scala",
        ".r",
        ".sql",
        ".sh",
        ".bash",
        ".zsh",
        ".ps1",
        # Web
        ".html",
        ".htm",
        ".xml",
        ".css",
        ".scss",
        ".sass",
        ".less",
        # Config
        ".ini",
        ".cfg",
        ".conf",
        ".env",
        ".properties",
    }

    # Word document extensions (special handling)
    DOCX_EXTENSIONS = {".docx", ".doc"}

    # Image extensions (may need OCR)
    IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"}

    @classmethod
    def get_document_type(cls, file_path: str) -> DocumentType:
        """
        Classify a single file by its type.

        The extension (case-insensitive) is checked against the class-level
        extension sets first. Files with an unrecognised extension are sniffed
        with ``_is_text_file`` and treated as TEXT when they look like UTF-8.

        Note: ``.md`` / ``.markdown`` are listed in ``TEXT_EXTENSIONS``, so this
        method reports them as ``DocumentType.TEXT``; ``DocumentType.MARKDOWN``
        is never returned here.

        Args:
            file_path: Path to the file

        Returns:
            DocumentType enum value
        """
        ext = Path(file_path).suffix.lower()

        if ext in cls.MINERU_EXTENSIONS:
            return DocumentType.PDF
        if ext in cls.TEXT_EXTENSIONS:
            return DocumentType.TEXT
        if ext in cls.DOCX_EXTENSIONS:
            return DocumentType.DOCX
        if ext in cls.IMAGE_EXTENSIONS:
            return DocumentType.IMAGE

        # Unrecognised extension: fall back to inspecting the content.
        if cls._is_text_file(file_path):
            return DocumentType.TEXT
        return DocumentType.UNKNOWN

    @classmethod
    def _is_text_file(cls, file_path: str, sample_size: int = 8192) -> bool:
        """
        Detect if a file is text-based by examining its content.

        A sample is considered text when it contains no NUL byte and is valid
        UTF-8. The sample may end in the middle of a multi-byte character when
        the file is longer than ``sample_size``; such a trailing partial
        sequence is tolerated instead of being reported as binary.

        Args:
            file_path: Path to the file
            sample_size: Number of bytes to sample

        Returns:
            True if file appears to be text; False if it looks binary or
            cannot be opened/read.
        """
        import codecs

        try:
            with open(file_path, "rb") as f:
                chunk = f.read(sample_size)
                # Peek one byte past the sample to learn whether the sample
                # already covers the whole file.
                at_eof = not f.read(1)

            # Check for null bytes (binary file indicator)
            if b"\x00" in chunk:
                return False

            # Strict UTF-8 validation. With final=False the decoder only
            # buffers an incomplete sequence at the very end of the sample;
            # invalid bytes anywhere else still raise UnicodeDecodeError.
            codecs.getincrementaldecoder("utf-8")().decode(chunk, final=at_eof)
            return True
        except (UnicodeDecodeError, OSError):
            return False

    @classmethod
    def classify_files(cls, file_paths: List[str]) -> FileClassification:
        """
        Classify a list of files by processing method.

        PDF, DOCX and IMAGE documents are all routed to MinerU; TEXT and
        MARKDOWN documents are read directly; everything else is unsupported.
        Input order is preserved within each group.

        Args:
            file_paths: List of file paths to classify

        Returns:
            FileClassification with files grouped by processing method
        """
        mineru_files: List[str] = []
        text_files: List[str] = []
        unsupported: List[str] = []

        for path in file_paths:
            doc_type = cls.get_document_type(path)

            if doc_type in (DocumentType.PDF, DocumentType.DOCX, DocumentType.IMAGE):
                # DOCX and images are sent to MinerU unconditionally; no
                # multimodal/OCR capability check is performed here.
                mineru_files.append(path)
            elif doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN):
                text_files.append(path)
            else:
                unsupported.append(path)

        logger.debug(
            f"Classified {len(file_paths)} files: "
            f"{len(mineru_files)} MinerU, {len(text_files)} text, {len(unsupported)} unsupported"
        )

        return FileClassification(
            needs_mineru=mineru_files,
            text_files=text_files,
            unsupported=unsupported,
        )

    @classmethod
    async def read_text_file(cls, file_path: str) -> str:
        """
        Read a text file, trying a fixed sequence of encodings.

        The encodings are tried strictly in order: UTF-8 (a leading BOM, if
        present, is stripped), GBK, GB18030, and finally latin-1. latin-1 maps
        every byte to a code point, so the last attempt cannot fail on
        decoding.

        Note: the read itself is a blocking call even though the method is a
        coroutine.

        Args:
            file_path: Path to the text file

        Returns:
            File content as string

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # "utf-8-sig" decodes plain UTF-8 as well and drops a BOM when there is
        # one. GBK is tried before its superset GB18030.
        for encoding in ("utf-8-sig", "gbk", "gb18030"):
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue

        # Last resort: latin-1 accepts any byte sequence.
        with open(file_path, "r", encoding="latin-1") as f:
            return f.read()

    @classmethod
    def needs_mineru(cls, file_path: str) -> bool:
        """
        Quick check if a single file needs MinerU parsing.

        Args:
            file_path: Path to the file

        Returns:
            True if the file is classified as PDF, DOCX or IMAGE
        """
        doc_type = cls.get_document_type(file_path)
        return doc_type in (DocumentType.PDF, DocumentType.DOCX, DocumentType.IMAGE)

    @classmethod
    def is_text_readable(cls, file_path: str) -> bool:
        """
        Check if a file can be read directly as text.

        Args:
            file_path: Path to the file

        Returns:
            True if the file is classified as TEXT or MARKDOWN
        """
        doc_type = cls.get_document_type(file_path)
        return doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN)
@@ -0,0 +1,234 @@
1
+ """
2
+ Pipeline Factory
3
+ ================
4
+
5
+ Factory for creating and managing RAG pipelines.
6
+
7
+ LightRAG is the default pipeline (always available).
8
+ RAGAnything and LlamaIndex are optional (require extra dependencies).
9
+ """
10
+
11
+ import logging
12
+ from typing import Callable, Dict, List, Optional
13
+
14
+ from .pipelines import lightrag
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Pipeline registry, keyed by pipeline id. It starts with the pipelines that are
# always importable; optional ones are added by _register_optional_pipelines().
_PIPELINES: Dict[str, Callable] = {
    "lightrag": lightrag.LightRAGPipeline,  # Knowledge graph: PDFParser, fast text-only (default)
    "realtimex": lightrag.LightRAGPipeline,  # Alias: RealTimeX (uses LightRAG with RealTimeX AI config)
}

# Pipeline metadata for list_pipelines() and for get_pipeline() error messages.
# Each entry mixes string fields ("id", "name", "description") with a boolean
# "available" flag, so the inner mapping is deliberately left unparameterised.
_PIPELINE_INFO: Dict[str, Dict] = {
    "realtimex": {
        "id": "realtimex",
        "name": "RealTimeX",
        "description": "RealTimeX AI powered knowledge retrieval (recommended).",
        "available": True,
    },
    "lightrag": {
        "id": "lightrag",
        "name": "LightRAG",
        "description": "Lightweight knowledge graph retrieval, fast processing of text documents.",
        "available": True,
    },
}
39
+
40
+ # Try to register optional pipelines
41
def _register_optional_pipelines():
    """
    Probe the optional pipeline backends and record the outcome.

    For each optional backend the import is attempted once. On success the
    pipeline class is added to ``_PIPELINES`` and its ``_PIPELINE_INFO`` entry
    is marked available; on ImportError only an "unavailable" info entry
    (carrying the install hint) is stored.
    """
    # RAGAnything (requires raganything package)
    raganything_base = {"id": "raganything", "name": "RAG-Anything"}
    try:
        from .pipelines.raganything import RAGAnythingPipeline
    except ImportError as e:
        _PIPELINE_INFO["raganything"] = dict(
            raganything_base,
            description="Multimodal document processing (requires: pip install realtimex-deeptutor[raganything])",
            available=False,
        )
        logger.debug(f"RAGAnything not available: {e}")
    else:
        _PIPELINES["raganything"] = RAGAnythingPipeline
        _PIPELINE_INFO["raganything"] = dict(
            raganything_base,
            description="Multimodal document processing with chart and formula extraction.",
            available=True,
        )
        logger.debug("RAGAnything pipeline registered")

    # LlamaIndex (requires llama-index package)
    llamaindex_base = {"id": "llamaindex", "name": "LlamaIndex"}
    try:
        from .pipelines import llamaindex
    except ImportError as e:
        _PIPELINE_INFO["llamaindex"] = dict(
            llamaindex_base,
            description="Vector retrieval (requires: pip install realtimex-deeptutor[llamaindex])",
            available=False,
        )
        logger.debug(f"LlamaIndex not available: {e}")
    else:
        _PIPELINES["llamaindex"] = llamaindex.LlamaIndexPipeline
        _PIPELINE_INFO["llamaindex"] = dict(
            llamaindex_base,
            description="Pure vector retrieval, fastest processing speed.",
            available=True,
        )
        logger.debug("LlamaIndex pipeline registered")
84
+
85
+
86
+ # Register optional pipelines at module load
87
+ _register_optional_pipelines()
88
+
89
+
90
def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kwargs):
    """
    Build the pipeline registered under ``name``.

    Args:
        name: Pipeline name (lightrag, realtimex, raganything, llamaindex, or a
            name added through register_pipeline()). Defaults to 'lightrag',
            which is always registered.
        kb_base_dir: Base directory for knowledge bases.
        **kwargs: Extra constructor arguments. These are forwarded only to the
            'llamaindex' and 'raganything' pipelines; every other factory is
            called with ``kb_base_dir`` alone.

    Returns:
        Pipeline instance

    Raises:
        ValueError: If pipeline name is not found or not available
    """
    if name not in _PIPELINES:
        available = list(_PIPELINES.keys())
        if name not in _PIPELINE_INFO:
            raise ValueError(f"Unknown pipeline: {name}. Available: {available}")
        # Known pipeline whose optional dependency failed to import.
        info = _PIPELINE_INFO[name]
        raise ValueError(
            f"Pipeline '{name}' is not available. {info['description']}. "
            f"Available pipelines: {available}"
        )

    factory = _PIPELINES[name]

    if name in ("llamaindex", "raganything"):
        # These are classes; kb_base_dir is merged into kwargs only when it
        # is truthy, so the class default applies otherwise.
        if kb_base_dir:
            kwargs["kb_base_dir"] = kb_base_dir
        return factory(**kwargs)

    # lightrag, realtimex and any custom registration: called with
    # kb_base_dir only.
    return factory(kb_base_dir=kb_base_dir)
133
+
134
+
135
def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
    """
    Describe the known pipelines in a fixed display order.

    Only the ids in the built-in order (realtimex, lightrag, raganything,
    llamaindex) are considered, and only those that have an entry in
    ``_PIPELINE_INFO``.

    Args:
        include_unavailable: If True, also include pipelines that aren't installed

    Returns:
        List of dictionaries with the keys "id", "name" and "description"
    """
    # realtimex comes first because it is the recommended pipeline.
    display_order = ("realtimex", "lightrag", "raganything", "llamaindex")
    known = (_PIPELINE_INFO[pid] for pid in display_order if pid in _PIPELINE_INFO)

    return [
        {
            "id": entry["id"],
            "name": entry["name"],
            "description": entry["description"],
        }
        for entry in known
        if include_unavailable or entry.get("available", False)
    ]
160
+
161
+
162
def register_pipeline(name: str, factory: Callable):
    """
    Add (or replace) a pipeline factory in the registry.

    Only ``_PIPELINES`` is updated; no ``_PIPELINE_INFO`` entry is created.

    Args:
        name: Pipeline name
        factory: Factory function or class that creates the pipeline
    """
    _PIPELINES.update({name: factory})
171
+
172
+
173
def has_pipeline(name: str) -> bool:
    """
    Tell whether a pipeline is registered under ``name``.

    Args:
        name: Pipeline name

    Returns:
        True if ``name`` is a key of the pipeline registry
    """
    is_registered = name in _PIPELINES
    return is_registered
184
+
185
+
186
+ # Backward compatibility with old plugin API
187
def get_plugin(name: str) -> Dict[str, Callable]:
    """
    DEPRECATED: Use get_pipeline() instead.

    Builds the pipeline via get_pipeline(name) and exposes its ``initialize``,
    ``search`` and ``delete`` callables in the old plugin dictionary shape.
    When the pipeline has no ``delete`` attribute, a stand-in that always
    returns True is used.
    """
    import warnings

    message = "get_plugin() is deprecated, use get_pipeline() instead"
    # stacklevel=2 attributes the warning to the caller of get_plugin().
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    pipeline = get_pipeline(name)
    plugin = {
        "initialize": pipeline.initialize,
        "search": pipeline.search,
    }
    plugin["delete"] = getattr(pipeline, "delete", lambda kb: True)
    return plugin
207
+
208
+
209
def list_plugins() -> List[Dict[str, str]]:
    """
    DEPRECATED: Use list_pipelines() instead.

    Emits a DeprecationWarning and returns list_pipelines() with its default
    arguments.
    """
    import warnings

    message = "list_plugins() is deprecated, use list_pipelines() instead"
    # stacklevel=2 attributes the warning to the caller of list_plugins().
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    pipelines = list_pipelines()
    return pipelines
221
+
222
+
223
def has_plugin(name: str) -> bool:
    """
    DEPRECATED: Use has_pipeline() instead.

    Emits a DeprecationWarning and returns has_pipeline(name).
    """
    import warnings

    message = "has_plugin() is deprecated, use has_pipeline() instead"
    # stacklevel=2 attributes the warning to the caller of has_plugin().
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    exists = has_pipeline(name)
    return exists