vlm4ocr 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlm4ocr/data_types.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ from typing import List, Literal
3
+ from dataclasses import dataclass, field
4
+ from vlm4ocr.utils import get_default_page_delimiter
5
+
6
# Closed set of output formats accepted by OCRResult.
OutputMode = Literal["markdown", "HTML", "text"]


@dataclass
class OCRResult:
    """
    This class represents the result of an OCR process for one input.

    Parameters:
    ----------
    input_dir : str
        Path of the input (e.g., image, PDF, tiff). Its base name is stored
        in `filename`.
    output_mode : str
        The output format. Must be 'markdown', 'HTML', or 'text'.
    pages : List[dict], Optional
        One dict per page. Pages appended through `add_page` carry the keys
        'text' and 'image_processing_status'. Defaults to an empty list.

    Attributes:
    ----------
    filename : str
        `os.path.basename(input_dir)`; derived in `__post_init__`, not an
        init argument.
    status : str
        Initialised to "processing"; not an init argument. Nothing in this
        class updates it.
    """
    input_dir: str
    output_mode: OutputMode
    pages: List[dict] = field(default_factory=list)
    filename: str = field(init=False)
    status: str = field(init=False, default="processing")

    def __post_init__(self):
        """
        Called after the dataclass-generated __init__ method.
        Derives `filename` and validates `output_mode` and `pages`.

        Raises:
        ------
        ValueError
            If `output_mode` is not one of the supported modes, if `pages`
            is not a list, or if any element of `pages` is not a dict.
        """
        self.filename = os.path.basename(self.input_dir)

        # output_mode validation (type hints are not enforced at runtime)
        if self.output_mode not in ("markdown", "HTML", "text"):
            raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")

        # pages validation
        if not isinstance(self.pages, list):
            raise ValueError("pages must be a list of dict")
        for i, page_content in enumerate(self.pages):
            if not isinstance(page_content, dict):
                raise ValueError(f"Each page must be a dict. Page at index {i} is not a dict.")

    def add_page(self, text: str, image_processing_status: dict) -> None:
        """
        This method adds a new page to the OCRResult object.

        Parameters:
        ----------
        text : str
            The OCR result text of the page.
        image_processing_status : dict
            A dictionary containing the image processing status for the page.
            It can include keys like 'rotate_correction', 'max_dimension_pixels', etc.

        Raises:
        ------
        ValueError
            If `text` is not a str or `image_processing_status` is not a dict.
        """
        if not isinstance(text, str):
            raise ValueError("text must be a string")
        if not isinstance(image_processing_status, dict):
            raise ValueError("image_processing_status must be a dict")

        self.pages.append(
            {
                "text": text,
                "image_processing_status": image_processing_status,
            }
        )

    def __len__(self) -> int:
        """Return the number of pages (an OCRResult with no pages is falsy)."""
        return len(self.pages)

    def get_page(self, idx: int) -> dict:
        """
        Return the page dict at position `idx`.

        Parameters:
        ----------
        idx : int
            Zero-based page index. Negative indices are rejected.

        Raises:
        ------
        ValueError
            If `idx` is not an integer.
        IndexError
            If `idx` is outside `0 .. len(self) - 1`.
        """
        if not isinstance(idx, int):
            raise ValueError("Index must be an integer")
        if idx < 0 or idx >= len(self.pages):
            raise IndexError(f"Index out of range. The OCRResult has {len(self.pages)} pages, but index {idx} was requested.")

        return self.pages[idx]

    def __iter__(self):
        """Iterate over the page dicts in order."""
        return iter(self.pages)

    def __repr__(self) -> str:
        # Compact summary: reports the page count instead of dumping page contents.
        return f"OCRResult(filename={self.filename}, output_mode={self.output_mode}, pages_count={len(self.pages)}, status={self.status})"

    def to_string(self, page_delimiter: str = "auto") -> str:
        r"""
        Convert the OCRResult object to a string representation by joining
        the 'text' of every page.

        Parameters:
        ----------
        page_delimiter : str, Optional
            The delimiter to place between pages.
            if 'auto', it is resolved with
            `get_default_page_delimiter(self.output_mode)`; the documented
            defaults are:
                'markdown' -> '\n\n---\n\n'
                'HTML' -> '<br><br>'
                'text' -> '\n\n---\n\n'

        Returns:
        -------
        str
            The page texts joined by the delimiter. A page dict without a
            'text' key contributes an empty string.

        Raises:
        ------
        ValueError
            If `page_delimiter` is not a string.
        """
        if not isinstance(page_delimiter, str):
            raise ValueError("page_delimiter must be a string")

        # Keep the resolved delimiter local: converting to a string must not
        # leave hidden, possibly stale state on the result object.
        if page_delimiter == "auto":
            delimiter = get_default_page_delimiter(self.output_mode)
        else:
            delimiter = page_delimiter

        return delimiter.join(page.get("text", "") for page in self.pages)