myocr-lib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: myocr_lib
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: To be available soon
5
5
  Keywords: utility,library,functions,ocr,image-processing
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -16,4 +16,3 @@ Requires-Dist: ocrmypdf>=16.12.0
16
16
  Requires-Dist: Pillow>=9.0.0
17
17
  Requires-Dist: opencv-python>=4.12.0
18
18
  Requires-Dist: PyMuPDF>=1.26.7
19
-
@@ -0,0 +1,8 @@
1
+ ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
2
+ ocr_img/main_code.py,sha256=738bf1zZTr1FRncvPfUik4SSS2nSMkPqrCFRtnQeGmM,1300
3
+ ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
4
+ ocr_pdf/main_code.py,sha256=7wYi6HPUoKyvqOk02qiPC2RR39CSKCRb9Eki8vhOes0,9368
5
+ myocr_lib-0.1.4.dist-info/METADATA,sha256=7IopmcXCQifPCF8B_8AJRCVQIP8rQofXsLEA2sU1u5g,675
6
+ myocr_lib-0.1.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
+ myocr_lib-0.1.4.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
8
+ myocr_lib-0.1.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.2)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
ocr_img/main_code.py CHANGED
@@ -42,7 +42,7 @@ class ImageOCR:
42
42
  # apply the ocr
43
43
  text = self._apply_ocr(img)
44
44
 
45
- print(text)
45
+ # print(text)
46
46
 
47
47
  # temporary save to file for demo
48
48
  # get the extension
ocr_pdf/main_code.py CHANGED
@@ -13,6 +13,50 @@ class OCRDataExtractor:
13
13
  self.pages_to_ocr = [] # holds the pages needing ocr
14
14
 
15
15
 
16
def _raise_for_ocr_results(self, result):
    """Raise an exception describing a failed ``ocrmypdf`` run.

    Args:
        result: Object exposing a ``returncode`` attribute; the caller
            passes the value returned by ``subprocess.run`` for the
            ``ocrmypdf`` command.

    Raises:
        Exception: When ``result.returncode`` is one of the failure codes
            listed in the table below. Every other return code (including
            ``0``) falls through and the method returns ``None``.
    """
    # Return code -> human-readable reason. A plain lookup table replaces the
    # constant-only ``match`` statement; the messages are unchanged.
    failure_messages = {
        1: "Invalid arguments were provided.",
        2: "The input file is not a valid PDF.",
        3: "An external program (like Tesseract or Ghostscript) required by OCRmyPDF is missing.",
        4: "An output file was created, but it is invalid (the file will still be produced).",
        5: "Insufficient permissions to read the input or write the output file.",
        6: "The file already contains text, and the command was not configured to force re-processing.",
        7: "An error occurred in an external program (e.g., Ghostscript).",
        8: "The input PDF is encrypted and cannot be processed.",
        9: "Tesseract rejected a custom configuration file.",
        10: "A valid PDF was created, but conversion to PDF/A failed.",
        15: "An unexpected or other error occurred.",
        130: "The program was interrupted by the user (Ctrl+C). ",
    }

    message = failure_messages.get(result.returncode)
    if message is not None:
        raise Exception(message)
59
+
16
60
 
17
61
  def _apply_whole_pdf_ocr(self):
18
62
  # Define the Commands
@@ -27,14 +71,65 @@ class OCRDataExtractor:
27
71
  # run the command
28
72
  result = subprocess.run(command, capture_output=True, text=True)
29
73
 
74
+ try:
75
+ self._raise_for_ocr_results(result)
76
+
77
+ except Exception as e:
78
+ print("Error during OCR process:\n", str(e))
79
+
80
+ if result.returncode == 6: # already has text
81
+ print("Retrying using forcing..")
82
+ # use force ocr
83
+ # Define the Commands
84
+ command = [
85
+ "ocrmypdf",
86
+ f"{self.input_pdf_path}",
87
+ f"{self.output_file_path}",
88
+ "--force-ocr",
89
+ "--tesseract-timeout", "1000" # increased timeout
90
+ ]
91
+
92
+ # run the command
93
+ result = subprocess.run(command, capture_output=True, text=True)
94
+
95
+ else:
96
+ raise e
97
+
98
+
30
99
  if result.returncode == 0:
31
- print("PDF Was converted to Selectable Successfully.")
100
+ print("OCR Was applied on the PDF.")
32
101
 
33
102
  return True
34
103
 
35
104
  return False
36
105
 
37
106
 
107
+ ########### Ratio Functions
108
def _text_coverage_ratio(self, page):
    """Return the fraction of the page area covered by text blocks.

    Args:
        page: Page object exposing ``rect.width``, ``rect.height`` and
            ``get_text("blocks")`` (presumably a PyMuPDF page; confirm
            against the caller).

    Returns:
        The summed bounding-box area of every text block divided by the
        page area, or ``0`` when the page has no area. Each block is counted
        on its own, so overlapping blocks can push the value above 1.
    """
    page_area = page.rect.width * page.rect.height
    if not page_area:
        # Nothing to measure against; also avoids dividing by zero and
        # skips fetching the blocks at all.
        return 0

    # Each block starts with its bounding box (x0, y0, x1, y1); index 6 is
    # the block type, where 0 marks a text block.
    text_area = sum(
        (block[2] - block[0]) * (block[3] - block[1])
        for block in page.get_text("blocks")
        if block[6] == 0
    )
    return text_area / page_area
119
+
120
def _image_coverage_ratio(self, page):
    """Return the fraction of the page area covered by placed images.

    Args:
        page: Page object exposing ``rect.width``, ``rect.height``,
            ``get_images(full=True)`` and ``get_image_rects(xref)``
            (presumably a PyMuPDF page; confirm against the caller).

    Returns:
        The summed area of every image rectangle divided by the page area,
        or ``0`` when the page has no area. Overlapping rectangles are
        counted separately, so the value can exceed 1.
    """
    page_area = page.rect.width * page.rect.height
    if not page_area:
        # Nothing to measure against; also skips the image queries.
        return 0

    # The first field of each image entry is its xref. Query each distinct
    # xref once: ``get_image_rects`` is asked for all rectangles of that
    # xref, so a repeated listing would count the same rectangles again.
    unique_xrefs = dict.fromkeys(img[0] for img in page.get_images(full=True))

    image_area = sum(
        rect.width * rect.height
        for xref in unique_xrefs
        for rect in page.get_image_rects(xref)
    )
    return image_area / page_area
132
+
38
133
  def _is_whole_pdf_ocr(self):
39
134
  # read and extract the whole text from the pdf
40
135
  THRESHOLD_VALUE = 100 # minimum text
@@ -44,14 +139,31 @@ class OCRDataExtractor:
44
139
  for page_num in range(len(doc)):
45
140
  page = doc.load_page(page_num)
46
141
  text = page.get_text()
47
- if len(text.strip()) > THRESHOLD_VALUE:
48
- is_whole_pdf_ocr_applicable = False
49
- else:
142
+
143
+ # add logic if the image and text ratio is above a certain threshold
144
+ image_ratio = self._image_coverage_ratio(page)
145
+ text_ratio = self._text_coverage_ratio(page)
146
+ print(f"Image :\t{image_ratio*100:.2f}")
147
+
148
+
149
+ # if len(text.strip()) > THRESHOLD_VALUE:
150
+ if image_ratio > 0.45:
151
+ # print("Length of text:\n", len(text.strip()))
152
+ print(f"Image :\t{image_ratio*100:.2f}")
50
153
  # append to the page list
51
154
  self.pages_to_ocr.append(page_num)
155
+
156
+ elif text_ratio < 0.05:
157
+ print(f"Text :\t{text_ratio*100:.2f}")
158
+ # append to the page list
159
+ self.pages_to_ocr.append(page_num)
160
+
161
+ else:
162
+ is_whole_pdf_ocr_applicable = False
52
163
 
53
164
  return is_whole_pdf_ocr_applicable
54
165
 
166
+ ########### Page by Page Functions
55
167
  def _is_page_by_page_ocr(self):
56
168
  return len(self.pages_to_ocr) > 0
57
169
 
@@ -118,7 +230,7 @@ class OCRDataExtractor:
118
230
  # apply the extraction for the whole pdf through fitz
119
231
  text = self._extract_text_whole_pdf() if results else None
120
232
 
121
- print(text)
233
+ # print(text)
122
234
 
123
235
  # temporary store the text
124
236
  # ext = self.input_pdf_path.split('.')[-1]
@@ -141,7 +253,7 @@ class OCRDataExtractor:
141
253
  # do the extraction for specific pages only through fitz
142
254
  text = self._extract_text_page_by_page() if results else None
143
255
 
144
- print(text)
256
+ # print(text)
145
257
 
146
258
  # delete the output file
147
259
  self.delete_file(self.output_file_path)
@@ -1,8 +0,0 @@
1
- ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
2
- ocr_img/main_code.py,sha256=aqE-NGFDCJ_nqpMt7GujRvEin6dB4kCK9DEQiADnMRs,1298
3
- ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
4
- ocr_pdf/main_code.py,sha256=-ivDkwevfiLXEj-OtrPP5I9djBgec9xJV0ltIe4TQfg,5188
5
- myocr_lib-0.1.2.dist-info/METADATA,sha256=UdefSkJ_0SjJdGImDdkCyriWL5Krf3mHOnLohQjIX8c,677
6
- myocr_lib-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
7
- myocr_lib-0.1.2.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
8
- myocr_lib-0.1.2.dist-info/RECORD,,