realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,29 @@
1
+ """
2
+ RAG Components
3
+ ==============
4
+
5
+ Modular components for building RAG pipelines.
6
+
7
+ Components follow a simple protocol:
8
+ - Each component has a `name` attribute
9
+ - Each component has an async `process()` method
10
+ """
11
+
12
+ # Import component modules for convenience
13
+ from . import chunkers, embedders, indexers, parsers, retrievers
14
+ from .base import BaseComponent, Component
15
+ from .routing import DocumentType, FileClassification, FileTypeRouter
16
+
17
+ __all__ = [
18
+ "Component",
19
+ "BaseComponent",
20
+ "parsers",
21
+ "chunkers",
22
+ "embedders",
23
+ "indexers",
24
+ "retrievers",
25
+ # File type routing
26
+ "FileTypeRouter",
27
+ "FileClassification",
28
+ "DocumentType",
29
+ ]
@@ -0,0 +1,59 @@
1
+ """
2
+ Base Component
3
+ ==============
4
+
5
+ Base classes and protocols for RAG components.
6
+ """
7
+
8
+ from typing import Any, Protocol, runtime_checkable
9
+
10
+
11
@runtime_checkable
class Component(Protocol):
    """
    Structural interface shared by all RAG components.

    An object satisfies this protocol when it exposes:
    - name: str - identifier of the component
    - process(data, **kwargs) -> Any - coroutine that handles the input

    The protocol is runtime-checkable, so ``isinstance(obj, Component)``
    can be used to test for the presence of these members.
    """

    name: str

    async def process(self, data: Any, **kwargs) -> Any:
        """
        Handle the given input.

        Args:
            data: Input to work on
            **kwargs: Extra, component-specific options

        Returns:
            The result produced for ``data``
        """
        ...
35
+
36
+
37
class BaseComponent:
    """
    Shared foundation for concrete RAG components.

    Provides:
    - A ``logger`` attribute, created on construction and named after the
      concrete class
    - A class-level ``name`` that defaults to "base"; subclasses override it
    """

    name: str = "base"

    def __init__(self):
        # The logging helper is imported at construction time rather than at
        # module import time.
        from src.logging import get_logger

        logger_name = type(self).__name__
        self.logger = get_logger(logger_name)

    async def process(self, data: Any, **kwargs) -> Any:
        """
        Handle the given input.

        This base implementation always raises; concrete components
        are expected to override it.

        Raises:
            NotImplementedError: Always, when not overridden
        """
        raise NotImplementedError("Subclasses must implement process()")
@@ -0,0 +1,18 @@
1
+ """
2
+ Document Chunkers
3
+ =================
4
+
5
+ Chunkers for splitting documents into smaller pieces.
6
+ """
7
+
8
+ from .base import BaseChunker
9
+ from .fixed import FixedSizeChunker
10
+ from .numbered_item import NumberedItemExtractor
11
+ from .semantic import SemanticChunker
12
+
13
+ __all__ = [
14
+ "BaseChunker",
15
+ "SemanticChunker",
16
+ "FixedSizeChunker",
17
+ "NumberedItemExtractor",
18
+ ]
@@ -0,0 +1,34 @@
1
+ """
2
+ Base Chunker
3
+ ============
4
+
5
+ Base class for document chunkers.
6
+ """
7
+
8
+ from typing import List
9
+
10
+ from ...types import Chunk, Document
11
+ from ..base import BaseComponent
12
+
13
+
14
class BaseChunker(BaseComponent):
    """
    Common parent for document chunkers.

    A chunker takes a document and breaks it up into smaller chunks.
    """

    name = "base_chunker"

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Split a document into chunks.

        Args:
            doc: The document to split
            **kwargs: Extra, chunker-specific options

        Returns:
            The chunks produced from ``doc``

        Raises:
            NotImplementedError: Always, when not overridden
        """
        raise NotImplementedError("Subclasses must implement process()")
@@ -0,0 +1,71 @@
1
+ """
2
+ Fixed Size Chunker
3
+ ==================
4
+
5
+ Chunker that splits documents into fixed-size pieces.
6
+ """
7
+
8
+ from typing import List
9
+
10
+ from ...types import Chunk, Document
11
+ from ..base import BaseComponent
12
+
13
+
14
class FixedSizeChunker(BaseComponent):
    """
    Fixed-size chunker.

    Splits documents into windows of ``chunk_size`` characters; consecutive
    windows share ``chunk_overlap`` characters.
    """

    name = "fixed_size_chunker"

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        """
        Initialize fixed-size chunker.

        Args:
            chunk_size: Size of each chunk in characters (must be positive)
            chunk_overlap: Overlap between chunks (must satisfy
                ``0 <= chunk_overlap < chunk_size``)

        Raises:
            ValueError: If the size/overlap combination cannot produce a
                forward-moving window
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"with chunk_size={chunk_size}"
            )

        super().__init__()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Chunk a document into fixed-size pieces.

        Each window is stripped of surrounding whitespace; windows that are
        empty after stripping are dropped. ``start_pos``/``end_pos`` in the
        chunk metadata refer to the unstripped window in ``doc.content``.

        Args:
            doc: Document to chunk
            **kwargs: Additional arguments

        Returns:
            List of fixed-size Chunks (empty when the document has no content)

        Raises:
            ValueError: If ``chunk_size``/``chunk_overlap`` were changed after
                construction to values that do not advance the window
        """
        self.logger.info(f"Chunking document: {doc.file_path or 'inline'}")

        text = doc.content
        if not text:
            return []

        # The attributes are public and may have been reassigned since
        # construction. A zero step makes range() raise an opaque error and a
        # negative step silently yields no chunks, so check explicitly.
        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

        text_len = len(text)
        chunks = []

        for start in range(0, text_len, step):
            chunk_text = text[start : start + self.chunk_size].strip()
            if not chunk_text:
                continue
            chunks.append(
                Chunk(
                    content=chunk_text,
                    chunk_type="text",
                    metadata={
                        "start_pos": start,
                        "end_pos": min(start + self.chunk_size, text_len),
                        "source": doc.file_path,
                    },
                )
            )

        self.logger.info(f"Created {len(chunks)} chunks")
        return chunks
@@ -0,0 +1,94 @@
1
+ """
2
+ Numbered Item Extractor
3
+ =======================
4
+
5
+ Extracts numbered items (definitions, theorems, equations) from documents.
6
+ """
7
+
8
+ from typing import List
9
+
10
+ from ...types import Chunk, Document
11
+ from ..base import BaseComponent
12
+
13
+
14
class NumberedItemExtractor(BaseComponent):
    """
    Turn a document's numbered items into chunks.

    Delegates to the LLM-based extraction routine in
    ``src.knowledge.extract_numbered_items`` and wraps every returned item
    (definitions, theorems, equations, ...) in a ``Chunk``.
    """

    name = "numbered_item_extractor"

    def __init__(self, batch_size: int = 20, max_concurrent: int = 5):
        """
        Set up the extractor.

        Args:
            batch_size: Value forwarded as ``batch_size`` to the extraction routine
            max_concurrent: Value forwarded as ``max_concurrent`` to the
                extraction routine
        """
        super().__init__()
        self.batch_size = batch_size
        self.max_concurrent = max_concurrent

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Extract numbered items from a document.

        Extraction is best-effort: import problems and any other failure are
        logged and result in an empty list instead of an exception.

        Args:
            doc: Document to extract from (its ``content_items`` are used)
            **kwargs: Additional arguments

        Returns:
            One Chunk per extracted item; empty when the document has no
            ``content_items`` or extraction failed
        """
        if not doc.content_items:
            self.logger.warning("No content_items in document, skipping extraction")
            return []

        self.logger.info(f"Extracting numbered items from {len(doc.content_items)} content items")

        try:
            from src.knowledge.extract_numbered_items import (
                extract_numbered_items_with_llm_async,
            )
            from src.services.llm import get_llm_client

            client = get_llm_client()

            # Reuse the existing extraction routine with the LLM client's credentials
            extracted = await extract_numbered_items_with_llm_async(
                doc.content_items,
                api_key=client.config.api_key,
                base_url=client.config.base_url,
                batch_size=self.batch_size,
                max_concurrent=self.max_concurrent,
            )

            # Wrap every extracted item in a Chunk
            result = [
                Chunk(
                    content=payload["text"],
                    chunk_type=payload["type"],  # Definition, Theorem, Equation...
                    metadata={
                        "identifier": identifier,
                        "page": payload.get("page", 0),
                        "img_paths": payload.get("img_paths", []),
                        "source": doc.file_path,
                    },
                )
                for identifier, payload in extracted.items()
            ]

            self.logger.info(f"Extracted {len(result)} numbered items")
            return result

        except ImportError as e:
            self.logger.warning(f"Could not import extraction module: {e}")
            return []
        except Exception as e:
            self.logger.error(f"Failed to extract numbered items: {e}")
            return []
@@ -0,0 +1,97 @@
1
+ """
2
+ Semantic Chunker
3
+ ================
4
+
5
+ Chunker that splits documents based on semantic boundaries.
6
+ """
7
+
8
+ from typing import List
9
+
10
+ from ...types import Chunk, Document
11
+ from ..base import BaseComponent
12
+
13
+
14
class SemanticChunker(BaseComponent):
    """
    Semantic chunker.

    Splits documents into chunks of roughly ``chunk_size`` characters,
    preferring to end each chunk right after a separator (paragraph break,
    line break, sentence end, space) found near the size limit.
    """

    name = "semantic_chunker"

    # How many characters before the hard size limit are searched for a
    # separator to break on.
    _BOUNDARY_LOOKBACK = 200

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: List[str] = None,
    ):
        """
        Initialize semantic chunker.

        Args:
            chunk_size: Target chunk size in characters (must be positive)
            chunk_overlap: Overlap between chunks (must satisfy
                ``0 <= chunk_overlap < chunk_size``)
            separators: Separators to break on, tried in order; defaults to
                paragraph break, line break, sentence end, space

        Raises:
            ValueError: If the size/overlap combination is invalid
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"with chunk_size={chunk_size}"
            )

        super().__init__()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or ["\n\n", "\n", ". ", " "]

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Chunk a document semantically.

        Each chunk is stripped of surrounding whitespace; chunks that are
        empty after stripping are dropped. ``start_pos``/``end_pos`` in the
        chunk metadata refer to the unstripped span in ``doc.content``.

        Args:
            doc: Document to chunk
            **kwargs: Additional arguments

        Returns:
            List of semantic Chunks (empty when the document has no content)
        """
        self.logger.info(f"Chunking document: {doc.file_path or 'inline'}")

        text = doc.content
        if not text:
            return []

        text_len = len(text)
        chunks = []
        current_pos = 0

        while current_pos < text_len:
            # Hard upper bound for this chunk
            end_pos = min(current_pos + self.chunk_size, text_len)

            # Try to end on a natural break point unless this is the final chunk
            if end_pos < text_len:
                # Only the tail of the window is searched, so a break point
                # never shrinks the chunk by more than the lookback distance.
                search_start = max(end_pos - self._BOUNDARY_LOOKBACK, current_pos)
                for sep in self.separators:
                    sep_pos = text.rfind(sep, search_start, end_pos)
                    if sep_pos > current_pos:
                        end_pos = sep_pos + len(sep)
                        break

            chunk_text = text[current_pos:end_pos].strip()
            if chunk_text:
                chunks.append(
                    Chunk(
                        content=chunk_text,
                        chunk_type="text",
                        metadata={
                            "start_pos": current_pos,
                            "end_pos": end_pos,
                            "source": doc.file_path,
                        },
                    )
                )

            # The chunk that reaches the end of the text is the last one
            if end_pos >= text_len:
                break

            # Step back by the overlap for the next chunk. When a separator
            # shortened this chunk (or the attributes were changed after
            # construction), stepping back could stall or rewind the cursor --
            # even to a negative index, which would slice from the end of the
            # text. In that case continue without overlap so the loop always
            # moves forward.
            next_pos = end_pos - self.chunk_overlap
            if next_pos <= current_pos:
                next_pos = max(end_pos, current_pos + 1)
            current_pos = next_pos

        self.logger.info(f"Created {len(chunks)} chunks")
        return chunks
@@ -0,0 +1,14 @@
1
+ """
2
+ Document Embedders
3
+ ==================
4
+
5
+ Embedders for generating vector representations of text.
6
+ """
7
+
8
+ from .base import BaseEmbedder
9
+ from .openai import OpenAIEmbedder
10
+
11
+ __all__ = [
12
+ "BaseEmbedder",
13
+ "OpenAIEmbedder",
14
+ ]
@@ -0,0 +1,32 @@
1
+ """
2
+ Base Embedder
3
+ =============
4
+
5
+ Base class for document embedders.
6
+ """
7
+
8
+ from ...types import Document
9
+ from ..base import BaseComponent
10
+
11
+
12
class BaseEmbedder(BaseComponent):
    """
    Common parent for document embedders.

    An embedder produces vector representations for the chunks of a document.
    """

    name = "base_embedder"

    async def process(self, doc: Document, **kwargs) -> Document:
        """
        Embed the chunks of a document.

        Args:
            doc: The document whose chunks should be embedded
            **kwargs: Extra, embedder-specific options

        Returns:
            The document with embedded chunks

        Raises:
            NotImplementedError: Always, when not overridden
        """
        raise NotImplementedError("Subclasses must implement process()")
@@ -0,0 +1,63 @@
1
+ """
2
+ OpenAI Embedder
3
+ ===============
4
+
5
+ Embedder using OpenAI-compatible embedding API.
6
+ """
7
+
8
+ from ...types import Document
9
+ from ..base import BaseComponent
10
+
11
+
12
class OpenAIEmbedder(BaseComponent):
    """
    Embedder backed by the project's embedding client.

    Sends chunk texts to the client returned by
    ``src.services.embedding.get_embedding_client`` and stores the returned
    vectors on the chunks.
    """

    name = "openai_embedder"

    def __init__(self, batch_size: int = 100):
        """
        Set up the embedder.

        Args:
            batch_size: How many chunk texts are sent per ``embed`` call
        """
        super().__init__()
        self.batch_size = batch_size

    async def process(self, doc: Document, **kwargs) -> Document:
        """
        Embed the chunks of a document in place.

        Args:
            doc: Document whose chunks should be embedded
            **kwargs: Additional arguments

        Returns:
            The same document; each processed chunk has its ``embedding`` set
        """
        if not doc.chunks:
            self.logger.warning("No chunks to embed")
            return doc

        self.logger.info(f"Embedding {len(doc.chunks)} chunks")

        from src.services.embedding import get_embedding_client

        embedding_client = get_embedding_client()

        # Walk over the chunks one batch at a time
        total = len(doc.chunks)
        for offset in range(0, total, self.batch_size):
            window = doc.chunks[offset : offset + self.batch_size]
            vectors = await embedding_client.embed([item.content for item in window])

            # Pair each chunk with its vector, in order
            for item, vector in zip(window, vectors):
                item.embedding = vector

        self.logger.info("Embedding complete")
        return doc
@@ -0,0 +1,18 @@
1
+ """
2
+ Document Indexers
3
+ =================
4
+
5
+ Indexers for building searchable indexes from documents.
6
+ """
7
+
8
+ from .base import BaseIndexer
9
+ from .graph import GraphIndexer
10
+ from .lightrag import LightRAGIndexer
11
+ from .vector import VectorIndexer
12
+
13
+ __all__ = [
14
+ "BaseIndexer",
15
+ "VectorIndexer",
16
+ "GraphIndexer",
17
+ "LightRAGIndexer",
18
+ ]
@@ -0,0 +1,35 @@
1
+ """
2
+ Base Indexer
3
+ ============
4
+
5
+ Base class for document indexers.
6
+ """
7
+
8
+ from typing import List
9
+
10
+ from ...types import Document
11
+ from ..base import BaseComponent
12
+
13
+
14
class BaseIndexer(BaseComponent):
    """
    Common parent for document indexers.

    An indexer builds a searchable index out of documents.
    """

    name = "base_indexer"

    async def process(self, kb_name: str, documents: List[Document], **kwargs) -> bool:
        """
        Add documents to the index of a knowledge base.

        Args:
            kb_name: Name of the target knowledge base
            documents: The documents to index
            **kwargs: Extra, indexer-specific options

        Returns:
            True if successful

        Raises:
            NotImplementedError: Always, when not overridden
        """
        raise NotImplementedError("Subclasses must implement process()")