realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File Type Router
|
|
3
|
+
================
|
|
4
|
+
|
|
5
|
+
Centralized file type classification and routing for RAG pipelines.
|
|
6
|
+
Determines the appropriate processing method for each document type.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List
|
|
13
|
+
|
|
14
|
+
from src.logging import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger("FileTypeRouter")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DocumentType(Enum):
    """Closed set of document categories used to pick a processing path."""

    # Parsed through MinerU (complex layout extraction).
    PDF = "pdf"
    # Plain text that can be read straight from disk.
    TEXT = "text"
    # Structured text (Markdown).
    MARKDOWN = "markdown"
    # Word documents.
    DOCX = "docx"
    # Raster images; may need OCR.
    IMAGE = "image"
    # Anything that is not supported.
    UNKNOWN = "unknown"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class FileClassification:
    """Outcome of sorting a batch of file paths by processing method."""

    # Paths that must go through MinerU parsing (PDF, etc.).
    needs_mineru: List[str]
    # Paths that can be read directly as text.
    text_files: List[str]
    # Paths whose format is not supported.
    unsupported: List[str]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class FileTypeRouter:
    """
    File type router for RAG pipelines.

    Classifies files before processing to route them to appropriate handlers:
    - PDF, Word and image files -> MinerU parser (complex document parsing)
    - Text files -> Direct read (fast, simple)
    - Unsupported -> Skip with warning

    Usage:
        router = FileTypeRouter()
        classification = router.classify_files(file_paths)

        # Process PDF files with MinerU
        for pdf in classification.needs_mineru:
            await rag.process_document_complete(pdf, ...)

        # Process text files directly
        for txt in classification.text_files:
            content = await FileTypeRouter.read_text_file(txt)
            await rag.lightrag.ainsert(content)
    """

    # Extensions requiring MinerU parsing (complex document formats)
    MINERU_EXTENSIONS = {".pdf"}

    # Extensions for direct text reading
    TEXT_EXTENSIONS = {
        # Plain text
        ".txt",
        ".text",
        ".log",
        # Markup languages
        ".md",
        ".markdown",
        ".rst",
        ".asciidoc",
        # Data formats
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".csv",
        ".tsv",
        # LaTeX
        ".tex",
        ".latex",
        ".bib",
        # Code files
        ".py",
        ".js",
        ".ts",
        ".jsx",
        ".tsx",
        ".java",
        ".c",
        ".cpp",
        ".h",
        ".hpp",
        ".go",
        ".rs",
        ".rb",
        ".php",
        ".swift",
        ".kt",
        ".scala",
        ".r",
        ".sql",
        ".sh",
        ".bash",
        ".zsh",
        ".ps1",
        # Web
        ".html",
        ".htm",
        ".xml",
        ".css",
        ".scss",
        ".sass",
        ".less",
        # Config
        ".ini",
        ".cfg",
        ".conf",
        ".env",
        ".properties",
    }

    # Word document extensions (special handling)
    DOCX_EXTENSIONS = {".docx", ".doc"}

    # Image extensions (may need OCR)
    IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"}

    @classmethod
    def get_document_type(cls, file_path: str) -> "DocumentType":
        """
        Classify a single file by its type.

        The lower-cased extension is checked against the extension tables
        first; only files with an unrecognised extension are opened and
        sniffed for text content.

        Note: ".md" and ".markdown" are part of TEXT_EXTENSIONS, so they are
        reported as DocumentType.TEXT; this method never returns
        DocumentType.MARKDOWN.

        Args:
            file_path: Path to the file

        Returns:
            DocumentType enum value
        """
        ext = Path(file_path).suffix.lower()

        if ext in cls.MINERU_EXTENSIONS:
            return DocumentType.PDF
        if ext in cls.TEXT_EXTENSIONS:
            return DocumentType.TEXT
        if ext in cls.DOCX_EXTENSIONS:
            return DocumentType.DOCX
        if ext in cls.IMAGE_EXTENSIONS:
            return DocumentType.IMAGE

        # Unknown extension: fall back to inspecting the file content.
        if cls._is_text_file(file_path):
            return DocumentType.TEXT
        return DocumentType.UNKNOWN

    @classmethod
    def _is_text_file(cls, file_path: str, sample_size: int = 8192) -> bool:
        """
        Detect if a file is text-based by examining its content.

        A file counts as text when its first ``sample_size`` bytes contain no
        NUL byte and are valid UTF-8. A multi-byte character cut in half by
        the end of the sample is not treated as a decoding error.

        Args:
            file_path: Path to the file
            sample_size: Number of bytes to sample

        Returns:
            True if file appears to be text; False for binary content,
            non-UTF-8 content, or when the file cannot be opened or read
        """
        # Local import keeps this method self-contained.
        import codecs

        try:
            with open(file_path, "rb") as f:
                chunk = f.read(sample_size)
        except OSError:
            return False

        # Check for null bytes (binary file indicator)
        if b"\x00" in chunk:
            return False

        # A short read means the whole file was sampled, so it must decode
        # completely. A full-size sample may stop in the middle of a
        # multi-byte sequence; the incremental decoder buffers that tail
        # instead of raising, while still rejecting genuinely invalid bytes.
        whole_file_sampled = len(chunk) < sample_size
        decoder = codecs.getincrementaldecoder("utf-8")()
        try:
            decoder.decode(chunk, final=whole_file_sampled)
        except UnicodeDecodeError:
            return False
        return True

    @classmethod
    def classify_files(cls, file_paths: List[str]) -> "FileClassification":
        """
        Classify a list of files by processing method.

        Args:
            file_paths: List of file paths to classify

        Returns:
            FileClassification with files grouped by processing method
        """
        needs_mineru = []
        text_files = []
        unsupported = []

        for path in file_paths:
            doc_type = cls.get_document_type(path)

            if doc_type == DocumentType.PDF:
                needs_mineru.append(path)
            elif doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN):
                text_files.append(path)
            elif doc_type == DocumentType.DOCX:
                # DOCX files need special handling
                # For now, route to MinerU which can handle them
                needs_mineru.append(path)
            elif doc_type == DocumentType.IMAGE:
                # Images might need OCR - route to MinerU if multimodal is enabled
                needs_mineru.append(path)
            else:
                unsupported.append(path)

        logger.debug(
            f"Classified {len(file_paths)} files: "
            f"{len(needs_mineru)} MinerU, {len(text_files)} text, {len(unsupported)} unsupported"
        )

        return FileClassification(
            needs_mineru=needs_mineru,
            text_files=text_files,
            unsupported=unsupported,
        )

    @classmethod
    async def read_text_file(cls, file_path: str) -> str:
        """
        Read a text file with automatic encoding detection.

        Encodings are tried in order and the first one that decodes the whole
        file wins. A leading UTF-8 byte-order mark is stripped from the
        returned content.

        Args:
            file_path: Path to the text file

        Returns:
            File content as string

        Raises:
            OSError: If the file cannot be opened or read
        """
        # "utf-8-sig" decodes plain UTF-8 as well as BOM-prefixed UTF-8 and
        # drops the BOM, so it replaces a separate "utf-8" attempt.
        # "latin-1" maps every byte to a character and therefore never fails;
        # "cp1252" and the fallback below are only safety nets.
        encodings = ["utf-8-sig", "gbk", "gb2312", "gb18030", "latin-1", "cp1252"]

        for encoding in encodings:
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue

        # Last resort: read with error replacement
        with open(file_path, "rb") as f:
            return f.read().decode("utf-8", errors="replace")

    @classmethod
    def needs_mineru(cls, file_path: str) -> bool:
        """
        Quick check if a single file needs MinerU parsing.

        Args:
            file_path: Path to the file

        Returns:
            True if file requires MinerU (PDF, Word document or image)
        """
        doc_type = cls.get_document_type(file_path)
        return doc_type in (DocumentType.PDF, DocumentType.DOCX, DocumentType.IMAGE)

    @classmethod
    def is_text_readable(cls, file_path: str) -> bool:
        """
        Check if a file can be read directly as text.

        Args:
            file_path: Path to the file

        Returns:
            True if file can be read as text
        """
        doc_type = cls.get_document_type(file_path)
        return doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN)
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline Factory
|
|
3
|
+
================
|
|
4
|
+
|
|
5
|
+
Factory for creating and managing RAG pipelines.
|
|
6
|
+
|
|
7
|
+
LightRAG is the default pipeline (always available).
|
|
8
|
+
RAGAnything and LlamaIndex are optional (require extra dependencies).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Callable, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from .pipelines import lightrag
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)

# Pipeline registry: maps a pipeline name to the callable that builds it.
# Starts with the always-available pipelines; optional ones are added later
# by _register_optional_pipelines().
_PIPELINES: Dict[str, Callable] = {
    "lightrag": lightrag.LightRAGPipeline,  # Knowledge graph: PDFParser, fast text-only (default)
    "realtimex": lightrag.LightRAGPipeline,  # Alias: RealTimeX (uses LightRAG with RealTimeX AI config)
}

# Pipeline metadata for list_pipelines().
# Each entry carries string fields ("id", "name", "description") plus a
# boolean "available" flag, hence the object-valued inner mapping.
_PIPELINE_INFO: Dict[str, Dict[str, object]] = {
    "realtimex": {
        "id": "realtimex",
        "name": "RealTimeX",
        "description": "RealTimeX AI powered knowledge retrieval (recommended).",
        "available": True,
    },
    "lightrag": {
        "id": "lightrag",
        "name": "LightRAG",
        "description": "Lightweight knowledge graph retrieval, fast processing of text documents.",
        "available": True,
    },
}
|
|
39
|
+
|
|
40
|
+
# Try to register optional pipelines
|
|
41
|
+
def _register_optional_pipelines():
    """
    Add pipelines whose dependencies are optional to the registries.

    For each optional pipeline the import is attempted. On success the
    pipeline class is stored in ``_PIPELINES`` and marked available in
    ``_PIPELINE_INFO``; on ImportError only a metadata entry flagged as
    unavailable (with an install hint) is recorded.
    """

    def _info(pipeline_id, display_name, description, available):
        # Metadata record in the same key order as the static entries.
        return {
            "id": pipeline_id,
            "name": display_name,
            "description": description,
            "available": available,
        }

    # RAGAnything (requires raganything package)
    try:
        from .pipelines.raganything import RAGAnythingPipeline

        _PIPELINES["raganything"] = RAGAnythingPipeline
        _PIPELINE_INFO["raganything"] = _info(
            "raganything",
            "RAG-Anything",
            "Multimodal document processing with chart and formula extraction.",
            True,
        )
        logger.debug("RAGAnything pipeline registered")
    except ImportError as e:
        _PIPELINE_INFO["raganything"] = _info(
            "raganything",
            "RAG-Anything",
            "Multimodal document processing (requires: pip install realtimex-deeptutor[raganything])",
            False,
        )
        logger.debug(f"RAGAnything not available: {e}")

    # LlamaIndex (requires llama-index package)
    try:
        from .pipelines import llamaindex

        _PIPELINES["llamaindex"] = llamaindex.LlamaIndexPipeline
        _PIPELINE_INFO["llamaindex"] = _info(
            "llamaindex",
            "LlamaIndex",
            "Pure vector retrieval, fastest processing speed.",
            True,
        )
        logger.debug("LlamaIndex pipeline registered")
    except ImportError as e:
        _PIPELINE_INFO["llamaindex"] = _info(
            "llamaindex",
            "LlamaIndex",
            "Vector retrieval (requires: pip install realtimex-deeptutor[llamaindex])",
            False,
        )
        logger.debug(f"LlamaIndex not available: {e}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Register optional pipelines once, at module load, so the registries above
# already reflect which optional imports succeeded.
_register_optional_pipelines()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kwargs):
    """
    Build the pipeline registered under ``name``.

    Args:
        name: Pipeline name (lightrag, raganything, llamaindex).
              Default is 'lightrag' (always available).
        kb_base_dir: Base directory for knowledge bases.
        **kwargs: Extra constructor arguments. They are forwarded only to the
                  "llamaindex" and "raganything" pipelines; every other
                  pipeline is called with ``kb_base_dir`` alone.

    Returns:
        Pipeline instance

    Raises:
        ValueError: If pipeline name is not found or not available
    """
    if name not in _PIPELINES:
        registered = list(_PIPELINES.keys())
        if name in _PIPELINE_INFO:
            # Known pipeline whose optional dependency is missing.
            hint = _PIPELINE_INFO[name]["description"]
            message = (
                f"Pipeline '{name}' is not available. {hint}. "
                f"Available pipelines: {registered}"
            )
        else:
            message = f"Unknown pipeline: {name}. Available: {registered}"
        raise ValueError(message)

    build = _PIPELINES[name]

    if name in ("llamaindex", "raganything"):
        # These receive the caller's kwargs; kb_base_dir is added only when
        # it is truthy.
        if kb_base_dir:
            kwargs["kb_base_dir"] = kb_base_dir
        return build(**kwargs)

    # lightrag, realtimex, academic and any custom-registered pipeline are
    # called with kb_base_dir only.
    return build(kb_base_dir=kb_base_dir)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
    """
    Describe the known pipelines in display order.

    Only the ids in the fixed display order below are considered; ids that
    have no metadata entry are skipped.

    Args:
        include_unavailable: If True, also include pipelines that aren't installed

    Returns:
        List of dictionaries with "id", "name" and "description" keys
    """
    # realtimex first (recommended), then the others
    display_order = ("realtimex", "lightrag", "raganything", "llamaindex")
    known = (_PIPELINE_INFO[pid] for pid in display_order if pid in _PIPELINE_INFO)

    return [
        {
            "id": entry["id"],
            "name": entry["name"],
            "description": entry["description"],
        }
        for entry in known
        if include_unavailable or entry.get("available", False)
    ]
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def register_pipeline(name: str, factory: Callable):
    """
    Add a custom pipeline to the registry, replacing any entry with that name.

    Only the callable registry is updated; no metadata entry is created.

    Args:
        name: Pipeline name
        factory: Factory function or class that creates the pipeline
    """
    _PIPELINES.update({name: factory})
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def has_pipeline(name: str) -> bool:
    """
    Report whether ``name`` is present in the pipeline registry.

    Args:
        name: Pipeline name

    Returns:
        True if a factory is registered under that name
    """
    registry = _PIPELINES
    return name in registry
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Backward compatibility with old plugin API
|
|
187
|
+
def get_plugin(name: str) -> Dict[str, Callable]:
    """
    DEPRECATED: Use get_pipeline() instead.

    Builds the named pipeline and exposes it through the old plugin shape:
    a dict with "initialize", "search" and "delete" callables. When the
    pipeline has no ``delete`` attribute, a stand-in that returns True is
    used.
    """
    import warnings

    message = "get_plugin() is deprecated, use get_pipeline() instead"
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    pipeline = get_pipeline(name)

    plugin: Dict[str, Callable] = {}
    plugin["initialize"] = pipeline.initialize
    plugin["search"] = pipeline.search
    plugin["delete"] = getattr(pipeline, "delete", lambda kb: True)
    return plugin
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def list_plugins() -> List[Dict[str, str]]:
    """
    DEPRECATED: Use list_pipelines() instead.

    Emits a DeprecationWarning and returns list_pipelines() with its
    default arguments.
    """
    import warnings

    message = "list_plugins() is deprecated, use list_pipelines() instead"
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    pipelines = list_pipelines()
    return pipelines
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def has_plugin(name: str) -> bool:
    """
    DEPRECATED: Use has_pipeline() instead.

    Emits a DeprecationWarning and returns has_pipeline(name).
    """
    import warnings

    message = "has_plugin() is deprecated, use has_pipeline() instead"
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    exists = has_pipeline(name)
    return exists
|