longparser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longparser/__init__.py +104 -0
- longparser/chunkers/__init__.py +5 -0
- longparser/chunkers/hybrid_chunker.py +1046 -0
- longparser/extractors/__init__.py +9 -0
- longparser/extractors/base.py +62 -0
- longparser/extractors/docling_extractor.py +2065 -0
- longparser/extractors/latex_ocr.py +404 -0
- longparser/integrations/__init__.py +31 -0
- longparser/integrations/langchain.py +138 -0
- longparser/integrations/llamaindex.py +157 -0
- longparser/pipeline/__init__.py +8 -0
- longparser/pipeline/orchestrator.py +230 -0
- longparser/py.typed +0 -0
- longparser/schemas.py +247 -0
- longparser/server/__init__.py +22 -0
- longparser/server/app.py +1045 -0
- longparser/server/chat/__init__.py +39 -0
- longparser/server/chat/callbacks.py +110 -0
- longparser/server/chat/engine.py +341 -0
- longparser/server/chat/graph.py +176 -0
- longparser/server/chat/llm_chain.py +153 -0
- longparser/server/chat/retriever.py +111 -0
- longparser/server/chat/schemas.py +164 -0
- longparser/server/db.py +656 -0
- longparser/server/embeddings.py +181 -0
- longparser/server/queue.py +97 -0
- longparser/server/routers/__init__.py +0 -0
- longparser/server/schemas.py +204 -0
- longparser/server/vectorstores.py +443 -0
- longparser/server/worker.py +480 -0
- longparser/utils/__init__.py +5 -0
- longparser/utils/rtl_detector.py +93 -0
- longparser-0.1.0.dist-info/METADATA +337 -0
- longparser-0.1.0.dist-info/RECORD +36 -0
- longparser-0.1.0.dist-info/WHEEL +5 -0
- longparser-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Abstract base class for document extractors."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ..schemas import Document, Page, ProcessingConfig, ExtractorType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseExtractor(ABC):
    """Common interface that every document extractor implements.

    Concrete extractors subclass this and provide ``extract`` and
    ``extract_page``; ``get_provenance_info`` is inherited as-is.
    """

    # Only annotated here, never assigned: get_provenance_info() reads it,
    # so a subclass (or instance) has to set it before that method is called.
    extractor_type: ExtractorType
    # Version string reported in provenance info; subclasses may override.
    version: str = "1.0.0"

    @abstractmethod
    def extract(
        self,
        file_path: Path,
        config: ProcessingConfig,
        page_numbers: Optional[list[int]] = None,
    ) -> Document:
        """Extract the content of a whole document.

        Args:
            file_path: Location of the input file (PDF/image).
            config: Processing configuration to apply.
            page_numbers: Specific pages to process. When ``None``,
                every page is processed.

        Returns:
            A ``Document`` holding the extracted pages and blocks.
        """

    @abstractmethod
    def extract_page(
        self,
        file_path: Path,
        page_number: int,
        config: ProcessingConfig,
    ) -> Page:
        """Extract one page of a document.

        Args:
            file_path: Location of the input file.
            page_number: Page to extract, counted from 0.
            config: Processing configuration to apply.

        Returns:
            A ``Page`` holding the extracted blocks.
        """

    def get_provenance_info(self) -> dict:
        """Return the extractor's type and version as a provenance dict."""
        provenance = {}
        provenance["extractor"] = self.extractor_type
        provenance["extractor_version"] = self.version
        return provenance
|