myocr-lib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/METADATA +2 -3
- myocr_lib-0.1.4.dist-info/RECORD +8 -0
- {myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/WHEEL +1 -1
- ocr_img/main_code.py +1 -1
- ocr_pdf/main_code.py +118 -6
- myocr_lib-0.1.2.dist-info/RECORD +0 -8
- {myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: myocr_lib
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: To be available soon
|
|
5
5
|
Keywords: utility,library,functions,ocr,image-processing
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -16,4 +16,3 @@ Requires-Dist: ocrmypdf>=16.12.0
|
|
|
16
16
|
Requires-Dist: Pillow>=9.0.0
|
|
17
17
|
Requires-Dist: opencv-python>=4.12.0
|
|
18
18
|
Requires-Dist: PyMuPDF>=1.26.7
|
|
19
|
-
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
|
|
2
|
+
ocr_img/main_code.py,sha256=738bf1zZTr1FRncvPfUik4SSS2nSMkPqrCFRtnQeGmM,1300
|
|
3
|
+
ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
|
|
4
|
+
ocr_pdf/main_code.py,sha256=7wYi6HPUoKyvqOk02qiPC2RR39CSKCRb9Eki8vhOes0,9368
|
|
5
|
+
myocr_lib-0.1.4.dist-info/METADATA,sha256=7IopmcXCQifPCF8B_8AJRCVQIP8rQofXsLEA2sU1u5g,675
|
|
6
|
+
myocr_lib-0.1.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
myocr_lib-0.1.4.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
|
|
8
|
+
myocr_lib-0.1.4.dist-info/RECORD,,
|
ocr_img/main_code.py
CHANGED
ocr_pdf/main_code.py
CHANGED
|
@@ -13,6 +13,50 @@ class OCRDataExtractor:
|
|
|
13
13
|
self.pages_to_ocr = [] # holds the pages needing ocr
|
|
14
14
|
|
|
15
15
|
|
|
16
|
+
def _raise_for_ocr_results(self, result):
|
|
17
|
+
"""This function raises the matching exception based on the value of the result code."""
|
|
18
|
+
# key for the exceptions code
|
|
19
|
+
exceptions_codes = {exc:"" for exc in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 130]}
|
|
20
|
+
|
|
21
|
+
return_code = result.returncode
|
|
22
|
+
|
|
23
|
+
match return_code:
|
|
24
|
+
case 1:
|
|
25
|
+
raise Exception("Invalid arguments were provided.")
|
|
26
|
+
|
|
27
|
+
case 2:
|
|
28
|
+
raise Exception("The input file is not a valid PDF.")
|
|
29
|
+
|
|
30
|
+
case 3:
|
|
31
|
+
raise Exception("An external program (like Tesseract or Ghostscript) required by OCRmyPDF is missing.")
|
|
32
|
+
|
|
33
|
+
case 4:
|
|
34
|
+
raise Exception("An output file was created, but it is invalid (the file will still be produced).")
|
|
35
|
+
|
|
36
|
+
case 5:
|
|
37
|
+
raise Exception("Insufficient permissions to read the input or write the output file.")
|
|
38
|
+
|
|
39
|
+
case 6:
|
|
40
|
+
raise Exception("The file already contains text, and the command was not configured to force re-processing.")
|
|
41
|
+
|
|
42
|
+
case 7:
|
|
43
|
+
raise Exception("An error occurred in an external program (e.g., Ghostscript).")
|
|
44
|
+
|
|
45
|
+
case 8:
|
|
46
|
+
raise Exception("The input PDF is encrypted and cannot be processed.")
|
|
47
|
+
|
|
48
|
+
case 9:
|
|
49
|
+
raise Exception("Tesseract rejected a custom configuration file.")
|
|
50
|
+
|
|
51
|
+
case 10:
|
|
52
|
+
raise Exception("A valid PDF was created, but conversion to PDF/A failed.")
|
|
53
|
+
|
|
54
|
+
case 15:
|
|
55
|
+
raise Exception("An unexpected or other error occurred.")
|
|
56
|
+
|
|
57
|
+
case 130:
|
|
58
|
+
raise Exception("The program was interrupted by the user (Ctrl+C). ")
|
|
59
|
+
|
|
16
60
|
|
|
17
61
|
def _apply_whole_pdf_ocr(self):
|
|
18
62
|
# Define the Commands
|
|
@@ -27,14 +71,65 @@ class OCRDataExtractor:
|
|
|
27
71
|
# run the command
|
|
28
72
|
result = subprocess.run(command, capture_output=True, text=True)
|
|
29
73
|
|
|
74
|
+
try:
|
|
75
|
+
self._raise_for_ocr_results(result)
|
|
76
|
+
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print("Error during OCR process:\n", str(e))
|
|
79
|
+
|
|
80
|
+
if result.returncode == 6: # already has text
|
|
81
|
+
print("Retrying using forcing..")
|
|
82
|
+
# use force ocr
|
|
83
|
+
# Define the Commands
|
|
84
|
+
command = [
|
|
85
|
+
"ocrmypdf",
|
|
86
|
+
f"{self.input_pdf_path}",
|
|
87
|
+
f"{self.output_file_path}",
|
|
88
|
+
"--force-ocr",
|
|
89
|
+
"--tesseract-timeout", "1000" # increased timeout
|
|
90
|
+
]
|
|
91
|
+
|
|
92
|
+
# run the command
|
|
93
|
+
result = subprocess.run(command, capture_output=True, text=True)
|
|
94
|
+
|
|
95
|
+
else:
|
|
96
|
+
raise e
|
|
97
|
+
|
|
98
|
+
|
|
30
99
|
if result.returncode == 0:
|
|
31
|
-
print("
|
|
100
|
+
print("OCR Was applied on the PDF.")
|
|
32
101
|
|
|
33
102
|
return True
|
|
34
103
|
|
|
35
104
|
return False
|
|
36
105
|
|
|
37
106
|
|
|
107
|
+
########### Ratio Functions
|
|
108
|
+
def _text_coverage_ratio(self,page):
|
|
109
|
+
blocks = page.get_text("blocks")
|
|
110
|
+
page_area = page.rect.width * page.rect.height
|
|
111
|
+
|
|
112
|
+
text_area = 0
|
|
113
|
+
for block in blocks:
|
|
114
|
+
if block[6] == 0: # text block
|
|
115
|
+
x0, y0, x1, y1 = block[:4]
|
|
116
|
+
text_area += (x1 - x0) * (y1 - y0)
|
|
117
|
+
|
|
118
|
+
return text_area / page_area if page_area else 0
|
|
119
|
+
|
|
120
|
+
def _image_coverage_ratio(self, page):
|
|
121
|
+
images = page.get_images(full=True)
|
|
122
|
+
page_area = page.rect.width * page.rect.height
|
|
123
|
+
image_area = 0
|
|
124
|
+
|
|
125
|
+
for img in images:
|
|
126
|
+
xref = img[0]
|
|
127
|
+
rects = page.get_image_rects(xref)
|
|
128
|
+
for r in rects:
|
|
129
|
+
image_area += r.width * r.height
|
|
130
|
+
|
|
131
|
+
return image_area / page_area if page_area else 0
|
|
132
|
+
|
|
38
133
|
def _is_whole_pdf_ocr(self):
|
|
39
134
|
# read and extract the whole text from the pdf
|
|
40
135
|
THRESHOLD_VALUE = 100 # minimum text
|
|
@@ -44,14 +139,31 @@ class OCRDataExtractor:
|
|
|
44
139
|
for page_num in range(len(doc)):
|
|
45
140
|
page = doc.load_page(page_num)
|
|
46
141
|
text = page.get_text()
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
142
|
+
|
|
143
|
+
# add logic if the image and text ratio is above a certain threshold
|
|
144
|
+
image_ratio = self._image_coverage_ratio(page)
|
|
145
|
+
text_ratio = self._text_coverage_ratio(page)
|
|
146
|
+
print(f"Image :\t{image_ratio*100:.2f}")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# if len(text.strip()) > THRESHOLD_VALUE:
|
|
150
|
+
if image_ratio > 0.45:
|
|
151
|
+
# print("Length of text:\n", len(text.strip()))
|
|
152
|
+
print(f"Image :\t{image_ratio*100:.2f}")
|
|
50
153
|
# append to the page list
|
|
51
154
|
self.pages_to_ocr.append(page_num)
|
|
155
|
+
|
|
156
|
+
elif text_ratio < 0.05:
|
|
157
|
+
print(f"Text :\t{text_ratio*100:.2f}")
|
|
158
|
+
# append to the page list
|
|
159
|
+
self.pages_to_ocr.append(page_num)
|
|
160
|
+
|
|
161
|
+
else:
|
|
162
|
+
is_whole_pdf_ocr_applicable = False
|
|
52
163
|
|
|
53
164
|
return is_whole_pdf_ocr_applicable
|
|
54
165
|
|
|
166
|
+
########### Page by Page Functions
|
|
55
167
|
def _is_page_by_page_ocr(self):
|
|
56
168
|
return len(self.pages_to_ocr) > 0
|
|
57
169
|
|
|
@@ -118,7 +230,7 @@ class OCRDataExtractor:
|
|
|
118
230
|
# apply the extraction for the whole pdf through fitz
|
|
119
231
|
text = self._extract_text_whole_pdf() if results else None
|
|
120
232
|
|
|
121
|
-
print(text)
|
|
233
|
+
# print(text)
|
|
122
234
|
|
|
123
235
|
# temporary store the text
|
|
124
236
|
# ext = self.input_pdf_path.split('.')[-1]
|
|
@@ -141,7 +253,7 @@ class OCRDataExtractor:
|
|
|
141
253
|
# do the extraction for specific pages only through fitz
|
|
142
254
|
text = self._extract_text_page_by_page() if results else None
|
|
143
255
|
|
|
144
|
-
print(text)
|
|
256
|
+
# print(text)
|
|
145
257
|
|
|
146
258
|
# delete the output file
|
|
147
259
|
self.delete_file(self.output_file_path)
|
myocr_lib-0.1.2.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
|
|
2
|
-
ocr_img/main_code.py,sha256=aqE-NGFDCJ_nqpMt7GujRvEin6dB4kCK9DEQiADnMRs,1298
|
|
3
|
-
ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
|
|
4
|
-
ocr_pdf/main_code.py,sha256=-ivDkwevfiLXEj-OtrPP5I9djBgec9xJV0ltIe4TQfg,5188
|
|
5
|
-
myocr_lib-0.1.2.dist-info/METADATA,sha256=UdefSkJ_0SjJdGImDdkCyriWL5Krf3mHOnLohQjIX8c,677
|
|
6
|
-
myocr_lib-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
7
|
-
myocr_lib-0.1.2.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
|
|
8
|
-
myocr_lib-0.1.2.dist-info/RECORD,,
|
|
File without changes
|