longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ """Document extractors for LongParser."""
2
+
3
+ from .base import BaseExtractor
4
+ from .docling_extractor import DoclingExtractor
5
+
6
# Public API of the extractors package: the abstract base class plus the
# Docling-backed extractor imported above.
__all__ = ["BaseExtractor", "DoclingExtractor"]
@@ -0,0 +1,62 @@
1
+ """Abstract base class for document extractors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from ..schemas import Document, Page, ProcessingConfig, ExtractorType
8
+
9
+
10
class BaseExtractor(ABC):
    """Common interface implemented by every document extractor.

    Concrete extractors must provide ``extract`` and ``extract_page``.
    ``extractor_type`` is only declared here (no value is assigned), so it
    has to be set by a subclass or instance before ``get_provenance_info``
    is called; ``version`` defaults to ``"1.0.0"``.
    """

    # Annotation only: no class-level value is provided by the base class.
    extractor_type: ExtractorType
    version: str = "1.0.0"

    @abstractmethod
    def extract(
        self,
        file_path: Path,
        config: ProcessingConfig,
        page_numbers: Optional[list[int]] = None,
    ) -> Document:
        """Extract the content of a document.

        Args:
            file_path: Location of the input file (PDF/image).
            config: Processing configuration to apply.
            page_numbers: Specific pages to process. When ``None``,
                all pages are processed.

        Returns:
            A Document containing the extracted pages and blocks.
        """

    @abstractmethod
    def extract_page(
        self,
        file_path: Path,
        page_number: int,
        config: ProcessingConfig,
    ) -> Page:
        """Extract one page of a document.

        Args:
            file_path: Location of the input file.
            page_number: Page to extract, counted from 0.
            config: Processing configuration to apply.

        Returns:
            A Page containing the extracted blocks.
        """

    def get_provenance_info(self) -> dict:
        """Return the extractor type and version as a provenance mapping."""
        provenance = {}
        provenance["extractor"] = self.extractor_type
        provenance["extractor_version"] = self.version
        return provenance