myocr-lib 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {myocr_lib-0.1.3.dist-info → myocr_lib-0.1.4.dist-info}/METADATA +2 -3
- myocr_lib-0.1.4.dist-info/RECORD +8 -0
- {myocr_lib-0.1.3.dist-info → myocr_lib-0.1.4.dist-info}/WHEEL +1 -1
- ocr_pdf/main_code.py +116 -4
- myocr_lib-0.1.3.dist-info/RECORD +0 -8
- {myocr_lib-0.1.3.dist-info → myocr_lib-0.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: myocr_lib
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: To be available soon
|
|
5
5
|
Keywords: utility,library,functions,ocr,image-processing
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -16,4 +16,3 @@ Requires-Dist: ocrmypdf>=16.12.0
|
|
|
16
16
|
Requires-Dist: Pillow>=9.0.0
|
|
17
17
|
Requires-Dist: opencv-python>=4.12.0
|
|
18
18
|
Requires-Dist: PyMuPDF>=1.26.7
|
|
19
|
-
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
|
|
2
|
+
ocr_img/main_code.py,sha256=738bf1zZTr1FRncvPfUik4SSS2nSMkPqrCFRtnQeGmM,1300
|
|
3
|
+
ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
|
|
4
|
+
ocr_pdf/main_code.py,sha256=7wYi6HPUoKyvqOk02qiPC2RR39CSKCRb9Eki8vhOes0,9368
|
|
5
|
+
myocr_lib-0.1.4.dist-info/METADATA,sha256=7IopmcXCQifPCF8B_8AJRCVQIP8rQofXsLEA2sU1u5g,675
|
|
6
|
+
myocr_lib-0.1.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
myocr_lib-0.1.4.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
|
|
8
|
+
myocr_lib-0.1.4.dist-info/RECORD,,
|
ocr_pdf/main_code.py
CHANGED
|
@@ -13,6 +13,50 @@ class OCRDataExtractor:
|
|
|
13
13
|
self.pages_to_ocr = [] # holds the pages needing ocr
|
|
14
14
|
|
|
15
15
|
|
|
16
|
+
def _raise_for_ocr_results(self, result):
|
|
17
|
+
"""This function raises the corresponding and matching exceptions based on the value of thre result code."""
|
|
18
|
+
# key for the exceptions code
|
|
19
|
+
exceptions_codes = {exc:"" for exc in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 130]}
|
|
20
|
+
|
|
21
|
+
return_code = result.returncode
|
|
22
|
+
|
|
23
|
+
match return_code:
|
|
24
|
+
case 1:
|
|
25
|
+
raise Exception("Invalid arguments were provided.")
|
|
26
|
+
|
|
27
|
+
case 2:
|
|
28
|
+
raise Exception("The input file is not a valid PDF.")
|
|
29
|
+
|
|
30
|
+
case 3:
|
|
31
|
+
raise Exception("An external program (like Tesseract or Ghostscript) required by OCRmyPDF is missing.")
|
|
32
|
+
|
|
33
|
+
case 4:
|
|
34
|
+
raise Exception("An output file was created, but it is invalid (the file will still be produced).")
|
|
35
|
+
|
|
36
|
+
case 5:
|
|
37
|
+
raise Exception("Insufficient permissions to read the input or write the output file.")
|
|
38
|
+
|
|
39
|
+
case 6:
|
|
40
|
+
raise Exception("The file already contains text, and the command was not configured to force re-processing.")
|
|
41
|
+
|
|
42
|
+
case 7:
|
|
43
|
+
raise Exception("An error occurred in an external program (e.g., Ghostscript).")
|
|
44
|
+
|
|
45
|
+
case 8:
|
|
46
|
+
raise Exception("The input PDF is encrypted and cannot be processed.")
|
|
47
|
+
|
|
48
|
+
case 9:
|
|
49
|
+
raise Exception("Tesseract rejected a custom configuration file.")
|
|
50
|
+
|
|
51
|
+
case 10:
|
|
52
|
+
raise Exception("A valid PDF was created, but conversion to PDF/A failed.")
|
|
53
|
+
|
|
54
|
+
case 15:
|
|
55
|
+
raise Exception("An unexpected or other error occurred.")
|
|
56
|
+
|
|
57
|
+
case 130:
|
|
58
|
+
raise Exception("The program was interrupted by the user (Ctrl+C). ")
|
|
59
|
+
|
|
16
60
|
|
|
17
61
|
def _apply_whole_pdf_ocr(self):
|
|
18
62
|
# Define the Commands
|
|
@@ -27,14 +71,65 @@ class OCRDataExtractor:
|
|
|
27
71
|
# run the command
|
|
28
72
|
result = subprocess.run(command, capture_output=True, text=True)
|
|
29
73
|
|
|
74
|
+
try:
|
|
75
|
+
self._raise_for_ocr_results(result)
|
|
76
|
+
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print("Error during OCR process:\n", str(e))
|
|
79
|
+
|
|
80
|
+
if result.returncode == 6: # already has text
|
|
81
|
+
print("Retrying using forcing..")
|
|
82
|
+
# use force ocr
|
|
83
|
+
# Define the Commands
|
|
84
|
+
command = [
|
|
85
|
+
"ocrmypdf",
|
|
86
|
+
f"{self.input_pdf_path}",
|
|
87
|
+
f"{self.output_file_path}",
|
|
88
|
+
"--force-ocr",
|
|
89
|
+
"--tesseract-timeout", "1000" # increased timeout
|
|
90
|
+
]
|
|
91
|
+
|
|
92
|
+
# run the command
|
|
93
|
+
result = subprocess.run(command, capture_output=True, text=True)
|
|
94
|
+
|
|
95
|
+
else:
|
|
96
|
+
raise e
|
|
97
|
+
|
|
98
|
+
|
|
30
99
|
if result.returncode == 0:
|
|
31
|
-
print("OCR on PDF
|
|
100
|
+
print("OCR Was applied on the PDF.")
|
|
32
101
|
|
|
33
102
|
return True
|
|
34
103
|
|
|
35
104
|
return False
|
|
36
105
|
|
|
37
106
|
|
|
107
|
+
########### Ratio Functions
|
|
108
|
+
def _text_coverage_ratio(self,page):
|
|
109
|
+
blocks = page.get_text("blocks")
|
|
110
|
+
page_area = page.rect.width * page.rect.height
|
|
111
|
+
|
|
112
|
+
text_area = 0
|
|
113
|
+
for block in blocks:
|
|
114
|
+
if block[6] == 0: # text block
|
|
115
|
+
x0, y0, x1, y1 = block[:4]
|
|
116
|
+
text_area += (x1 - x0) * (y1 - y0)
|
|
117
|
+
|
|
118
|
+
return text_area / page_area if page_area else 0
|
|
119
|
+
|
|
120
|
+
def _image_coverage_ratio(self, page):
|
|
121
|
+
images = page.get_images(full=True)
|
|
122
|
+
page_area = page.rect.width * page.rect.height
|
|
123
|
+
image_area = 0
|
|
124
|
+
|
|
125
|
+
for img in images:
|
|
126
|
+
xref = img[0]
|
|
127
|
+
rects = page.get_image_rects(xref)
|
|
128
|
+
for r in rects:
|
|
129
|
+
image_area += r.width * r.height
|
|
130
|
+
|
|
131
|
+
return image_area / page_area if page_area else 0
|
|
132
|
+
|
|
38
133
|
def _is_whole_pdf_ocr(self):
|
|
39
134
|
# read and extract the whole text from the pdf
|
|
40
135
|
THRESHOLD_VALUE = 100 # minimum text
|
|
@@ -44,14 +139,31 @@ class OCRDataExtractor:
|
|
|
44
139
|
for page_num in range(len(doc)):
|
|
45
140
|
page = doc.load_page(page_num)
|
|
46
141
|
text = page.get_text()
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
142
|
+
|
|
143
|
+
# add logic if the image and text ratio is above a certain threshold
|
|
144
|
+
image_ratio = self._image_coverage_ratio(page)
|
|
145
|
+
text_ratio = self._text_coverage_ratio(page)
|
|
146
|
+
print(f"Image :\t{image_ratio*100:.2f}")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# if len(text.strip()) > THRESHOLD_VALUE:
|
|
150
|
+
if image_ratio > 0.45:
|
|
151
|
+
# print("Length of text:\n", len(text.strip()))
|
|
152
|
+
print(f"Image :\t{image_ratio*100:.2f}")
|
|
50
153
|
# append to the page list
|
|
51
154
|
self.pages_to_ocr.append(page_num)
|
|
155
|
+
|
|
156
|
+
elif text_ratio < 0.05:
|
|
157
|
+
print(f"Text :\t{text_ratio*100:.2f}")
|
|
158
|
+
# append to the page list
|
|
159
|
+
self.pages_to_ocr.append(page_num)
|
|
160
|
+
|
|
161
|
+
else:
|
|
162
|
+
is_whole_pdf_ocr_applicable = False
|
|
52
163
|
|
|
53
164
|
return is_whole_pdf_ocr_applicable
|
|
54
165
|
|
|
166
|
+
########### Page by Page Functions
|
|
55
167
|
def _is_page_by_page_ocr(self):
|
|
56
168
|
return len(self.pages_to_ocr) > 0
|
|
57
169
|
|
myocr_lib-0.1.3.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
|
|
2
|
-
ocr_img/main_code.py,sha256=738bf1zZTr1FRncvPfUik4SSS2nSMkPqrCFRtnQeGmM,1300
|
|
3
|
-
ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
|
|
4
|
-
ocr_pdf/main_code.py,sha256=Rz9PtpA79XPbX2VsmeNaQabnY_ZoRkYylTUeNZPc2cE,5175
|
|
5
|
-
myocr_lib-0.1.3.dist-info/METADATA,sha256=s-8a8rie6ytZfIIF4vplbuot2_gHccJ8gUF9vHslcwY,677
|
|
6
|
-
myocr_lib-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
7
|
-
myocr_lib-0.1.3.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
|
|
8
|
-
myocr_lib-0.1.3.dist-info/RECORD,,
|
|
File without changes
|