realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RAG Components
|
|
3
|
+
==============
|
|
4
|
+
|
|
5
|
+
Modular components for building RAG pipelines.
|
|
6
|
+
|
|
7
|
+
Components follow a simple protocol:
|
|
8
|
+
- Each component has a `name` attribute
|
|
9
|
+
- Each component has an async `process()` method
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
# Import component modules for convenience
|
|
13
|
+
from . import chunkers, embedders, indexers, parsers, retrievers
|
|
14
|
+
from .base import BaseComponent, Component
|
|
15
|
+
from .routing import DocumentType, FileClassification, FileTypeRouter
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Component",
|
|
19
|
+
"BaseComponent",
|
|
20
|
+
"parsers",
|
|
21
|
+
"chunkers",
|
|
22
|
+
"embedders",
|
|
23
|
+
"indexers",
|
|
24
|
+
"retrievers",
|
|
25
|
+
# File type routing
|
|
26
|
+
"FileTypeRouter",
|
|
27
|
+
"FileClassification",
|
|
28
|
+
"DocumentType",
|
|
29
|
+
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Component
|
|
3
|
+
==============
|
|
4
|
+
|
|
5
|
+
Base classes and protocols for RAG components.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Protocol, runtime_checkable
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
class Component(Protocol):
    """
    Structural protocol satisfied by every RAG component.

    A conforming object exposes:
    - name: str - identifier for the component
    - process(data, **kwargs) -> Any - async transformation of input data
    """

    name: str

    async def process(self, data: Any, **kwargs) -> Any:
        """
        Transform *data* and return the result.

        Args:
            data: Input data to process
            **kwargs: Additional arguments

        Returns:
            Processed output
        """
        ...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class BaseComponent:
    """
    Concrete base class providing shared plumbing for RAG components.

    Supplies:
    - a logger named after the concrete subclass
    - a default ``name`` attribute that subclasses shadow
    """

    # Default identifier; every subclass overrides this with its own name.
    name: str = "base"

    def __init__(self):
        # Imported lazily so merely importing this module does not pull in
        # the project logging machinery.
        from src.logging import get_logger

        self.logger = get_logger(type(self).__name__)

    async def process(self, data: Any, **kwargs) -> Any:
        """
        Transform *data* and return the result.

        Subclasses must override this; the base implementation always raises.
        """
        raise NotImplementedError("Subclasses must implement process()")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Chunkers
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Chunkers for splitting documents into smaller pieces.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .base import BaseChunker
|
|
9
|
+
from .fixed import FixedSizeChunker
|
|
10
|
+
from .numbered_item import NumberedItemExtractor
|
|
11
|
+
from .semantic import SemanticChunker
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseChunker",
|
|
15
|
+
"SemanticChunker",
|
|
16
|
+
"FixedSizeChunker",
|
|
17
|
+
"NumberedItemExtractor",
|
|
18
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Chunker
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
Base class for document chunkers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from ...types import Chunk, Document
|
|
11
|
+
from ..base import BaseComponent
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseChunker(BaseComponent):
    """
    Abstract chunker: turns one Document into a list of Chunks.

    Concrete subclasses decide the splitting strategy.
    """

    name = "base_chunker"

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Split *doc* into chunks.

        Args:
            doc: Document to chunk
            **kwargs: Additional arguments

        Returns:
            List of Chunks

        Raises:
            NotImplementedError: always; subclasses provide the strategy.
        """
        raise NotImplementedError("Subclasses must implement process()")
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fixed Size Chunker
|
|
3
|
+
==================
|
|
4
|
+
|
|
5
|
+
Chunker that splits documents into fixed-size pieces.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from ...types import Chunk, Document
|
|
11
|
+
from ..base import BaseComponent
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FixedSizeChunker(BaseComponent):
    """
    Fixed-size chunker.

    Splits a document's text into chunks of ``chunk_size`` characters,
    with consecutive chunks overlapping by ``chunk_overlap`` characters.
    """

    name = "fixed_size_chunker"

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        """
        Initialize fixed-size chunker.

        Args:
            chunk_size: Size of each chunk in characters (must be > 0)
            chunk_overlap: Overlap between chunks (must be >= 0 and < chunk_size)

        Raises:
            ValueError: if the parameters would stop the sliding window from
                advancing. Previously an overlap >= chunk_size either raised
                ``ValueError`` deep inside ``range()`` (step 0) or silently
                produced an empty result (negative step); failing fast here
                surfaces the misconfiguration at construction time.
        """
        super().__init__()
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Chunk a document into fixed-size pieces.

        Args:
            doc: Document to chunk
            **kwargs: Additional arguments

        Returns:
            List of fixed-size Chunks (whitespace-only windows are dropped)
        """
        self.logger.info(f"Chunking document: {doc.file_path or 'inline'}")

        text = doc.content
        if not text:
            return []

        chunks = []
        # Guaranteed positive by the __init__ validation, so the loop
        # always advances.
        step = self.chunk_size - self.chunk_overlap

        for i in range(0, len(text), step):
            chunk_text = text[i : i + self.chunk_size].strip()
            if chunk_text:
                chunks.append(
                    Chunk(
                        content=chunk_text,
                        chunk_type="text",
                        metadata={
                            "start_pos": i,
                            "end_pos": min(i + self.chunk_size, len(text)),
                            "source": doc.file_path,
                        },
                    )
                )

        self.logger.info(f"Created {len(chunks)} chunks")
        return chunks
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Numbered Item Extractor
|
|
3
|
+
=======================
|
|
4
|
+
|
|
5
|
+
Extracts numbered items (definitions, theorems, equations) from documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from ...types import Chunk, Document
|
|
11
|
+
from ..base import BaseComponent
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NumberedItemExtractor(BaseComponent):
    """
    Pull numbered academic items (definitions, theorems, equations, ...)
    out of a parsed document with the help of an LLM.

    Delegates the actual extraction to the existing knowledge-module
    pipeline and converts its output into Chunk objects.
    """

    name = "numbered_item_extractor"

    def __init__(self, batch_size: int = 20, max_concurrent: int = 5):
        """
        Initialize numbered item extractor.

        Args:
            batch_size: Number of content items sent to the LLM per batch
            max_concurrent: Upper bound on concurrent LLM calls
        """
        super().__init__()
        self.batch_size = batch_size
        self.max_concurrent = max_concurrent

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Extract numbered items from *doc*.

        Best-effort: any import or runtime failure is logged and an empty
        list is returned instead of propagating the error.

        Args:
            doc: Document to extract from (must have content_items)
            **kwargs: Additional arguments

        Returns:
            List of Chunks, one per extracted numbered item
        """
        if not doc.content_items:
            self.logger.warning("No content_items in document, skipping extraction")
            return []

        self.logger.info(f"Extracting numbered items from {len(doc.content_items)} content items")

        try:
            # Lazy imports: these pull in heavy project modules.
            from src.knowledge.extract_numbered_items import (
                extract_numbered_items_with_llm_async,
            )
            from src.services.llm import get_llm_client

            client = get_llm_client()

            # Reuse the established extraction pipeline.
            items = await extract_numbered_items_with_llm_async(
                doc.content_items,
                api_key=client.config.api_key,
                base_url=client.config.base_url,
                batch_size=self.batch_size,
                max_concurrent=self.max_concurrent,
            )

            # One Chunk per extracted item, typed by its academic category
            # (Definition, Theorem, Equation, ...).
            chunks = [
                Chunk(
                    content=payload["text"],
                    chunk_type=payload["type"],
                    metadata={
                        "identifier": key,
                        "page": payload.get("page", 0),
                        "img_paths": payload.get("img_paths", []),
                        "source": doc.file_path,
                    },
                )
                for key, payload in items.items()
            ]

            self.logger.info(f"Extracted {len(chunks)} numbered items")
            return chunks

        except ImportError as e:
            self.logger.warning(f"Could not import extraction module: {e}")
            return []
        except Exception as e:
            self.logger.error(f"Failed to extract numbered items: {e}")
            return []
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Chunker
|
|
3
|
+
================
|
|
4
|
+
|
|
5
|
+
Chunker that splits documents based on semantic boundaries.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from ...types import Chunk, Document
|
|
11
|
+
from ..base import BaseComponent
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SemanticChunker(BaseComponent):
    """
    Semantic chunker.

    Splits a document's text near natural boundaries (paragraphs, lines,
    sentences) while keeping each chunk close to ``chunk_size`` characters,
    with ``chunk_overlap`` characters shared between neighbouring chunks.
    """

    name = "semantic_chunker"

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: Optional[List[str]] = None,
    ):
        """
        Initialize semantic chunker.

        Args:
            chunk_size: Target chunk size in characters
            chunk_overlap: Overlap between chunks
            separators: Break-point strings tried in order of preference;
                defaults to paragraph, line, sentence, then word boundaries.
        """
        super().__init__()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or ["\n\n", "\n", ". ", " "]

    async def process(self, doc: Document, **kwargs) -> List[Chunk]:
        """
        Chunk a document semantically.

        Args:
            doc: Document to chunk
            **kwargs: Additional arguments

        Returns:
            List of semantic Chunks
        """
        self.logger.info(f"Chunking document: {doc.file_path or 'inline'}")

        text = doc.content
        if not text:
            return []

        chunks = []
        current_pos = 0

        while current_pos < len(text):
            end_pos = min(current_pos + self.chunk_size, len(text))

            # Prefer cutting at a natural boundary within the last ~200
            # characters of the window rather than mid-sentence.
            if end_pos < len(text):
                for sep in self.separators:
                    search_start = max(current_pos + self.chunk_size - 200, current_pos)
                    sep_pos = text.rfind(sep, search_start, end_pos)
                    if sep_pos > current_pos:
                        end_pos = sep_pos + len(sep)
                        break

            chunk_text = text[current_pos:end_pos].strip()
            if chunk_text:
                chunks.append(
                    Chunk(
                        content=chunk_text,
                        chunk_type="text",
                        metadata={
                            "start_pos": current_pos,
                            "end_pos": end_pos,
                            "source": doc.file_path,
                        },
                    )
                )

            # The final window always reaches the end of the text, so this
            # is equivalent to the previous exit test — but checked before
            # repositioning.
            if end_pos >= len(text):
                break

            # Step back by the overlap, but always advance by at least one
            # character: when a separator lands early relative to the
            # overlap (small chunk_size / large chunk_overlap), the bare
            # ``end_pos - chunk_overlap`` could move backwards and loop
            # forever.
            current_pos = max(end_pos - self.chunk_overlap, current_pos + 1)

        self.logger.info(f"Created {len(chunks)} chunks")
        return chunks
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Embedder
|
|
3
|
+
=============
|
|
4
|
+
|
|
5
|
+
Base class for document embedders.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ...types import Document
|
|
9
|
+
from ..base import BaseComponent
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseEmbedder(BaseComponent):
    """
    Abstract embedder: attaches vector representations to a document's
    chunks.

    Concrete subclasses talk to a specific embedding backend.
    """

    name = "base_embedder"

    async def process(self, doc: Document, **kwargs) -> Document:
        """
        Embed the chunks of *doc*.

        Args:
            doc: Document with chunks to embed
            **kwargs: Additional arguments

        Returns:
            Document with embedded chunks

        Raises:
            NotImplementedError: always; subclasses provide the backend.
        """
        raise NotImplementedError("Subclasses must implement process()")
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenAI Embedder
|
|
3
|
+
===============
|
|
4
|
+
|
|
5
|
+
Embedder using OpenAI-compatible embedding API.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ...types import Document
|
|
9
|
+
from ..base import BaseComponent
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OpenAIEmbedder(BaseComponent):
    """
    OpenAI-compatible embedder.

    Delegates to the shared embedding service to attach a vector to every
    chunk of a document, batching texts to limit the number of API calls.
    """

    name = "openai_embedder"

    def __init__(self, batch_size: int = 100):
        """
        Initialize OpenAI embedder.

        Args:
            batch_size: Number of texts to embed per API call
        """
        super().__init__()
        self.batch_size = batch_size

    async def process(self, doc: Document, **kwargs) -> Document:
        """
        Attach embeddings to *doc*'s chunks in place.

        Args:
            doc: Document with chunks to embed
            **kwargs: Additional arguments

        Returns:
            The same Document, with each chunk's ``embedding`` populated
        """
        if not doc.chunks:
            self.logger.warning("No chunks to embed")
            return doc

        self.logger.info(f"Embedding {len(doc.chunks)} chunks")

        # Lazy import keeps the embedding service out of module-import time.
        from src.services.embedding import get_embedding_client

        client = get_embedding_client()

        # One API call per batch of chunk texts.
        start = 0
        total = len(doc.chunks)
        while start < total:
            batch = doc.chunks[start : start + self.batch_size]
            vectors = await client.embed([c.content for c in batch])

            for chunk, vector in zip(batch, vectors):
                chunk.embedding = vector

            start += self.batch_size

        self.logger.info("Embedding complete")
        return doc
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Indexers
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Indexers for building searchable indexes from documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .base import BaseIndexer
|
|
9
|
+
from .graph import GraphIndexer
|
|
10
|
+
from .lightrag import LightRAGIndexer
|
|
11
|
+
from .vector import VectorIndexer
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseIndexer",
|
|
15
|
+
"VectorIndexer",
|
|
16
|
+
"GraphIndexer",
|
|
17
|
+
"LightRAGIndexer",
|
|
18
|
+
]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Indexer
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
Base class for document indexers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from ...types import Document
|
|
11
|
+
from ..base import BaseComponent
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseIndexer(BaseComponent):
    """
    Abstract indexer: writes documents into a named knowledge base.

    NOTE(review): this signature is wider than the ``Component`` protocol's
    ``process(data, **kwargs)`` — presumably indexer callers pass
    ``(kb_name, documents)`` positionally; confirm before unifying.
    """

    name = "base_indexer"

    async def process(self, kb_name: str, documents: List[Document], **kwargs) -> bool:
        """
        Index *documents* into the knowledge base *kb_name*.

        Args:
            kb_name: Knowledge base name
            documents: List of documents to index
            **kwargs: Additional arguments

        Returns:
            True if successful

        Raises:
            NotImplementedError: always; subclasses provide the strategy.
        """
        raise NotImplementedError("Subclasses must implement process()")
|