myocr-lib 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: myocr_lib
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: To be available soon
5
5
  Keywords: utility,library,functions,ocr,image-processing
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "myocr_lib" # Replace with your actual library name
7
- version = "0.1.3"
7
+ version = "0.1.4"
8
8
  # authors = [
9
9
  # {name = "Muhammad Asif Ali", email = "creativedeveloper151214@gmail.com"}
10
10
  # ]
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: myocr_lib
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: To be available soon
5
5
  Keywords: utility,library,functions,ocr,image-processing
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -13,6 +13,50 @@ class OCRDataExtractor:
13
13
  self.pages_to_ocr = [] # holds the pages needing ocr
14
14
 
15
15
 
16
+ def _raise_for_ocr_results(self, result):
17
+ """This function raises the corresponding and matching exceptions based on the value of thre result code."""
18
+ # key for the exceptions code
19
+ exceptions_codes = {exc:"" for exc in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 130]}
20
+
21
+ return_code = result.returncode
22
+
23
+ match return_code:
24
+ case 1:
25
+ raise Exception("Invalid arguments were provided.")
26
+
27
+ case 2:
28
+ raise Exception("The input file is not a valid PDF.")
29
+
30
+ case 3:
31
+ raise Exception("An external program (like Tesseract or Ghostscript) required by OCRmyPDF is missing.")
32
+
33
+ case 4:
34
+ raise Exception("An output file was created, but it is invalid (the file will still be produced).")
35
+
36
+ case 5:
37
+ raise Exception("Insufficient permissions to read the input or write the output file.")
38
+
39
+ case 6:
40
+ raise Exception("The file already contains text, and the command was not configured to force re-processing.")
41
+
42
+ case 7:
43
+ raise Exception("An error occurred in an external program (e.g., Ghostscript).")
44
+
45
+ case 8:
46
+ raise Exception("The input PDF is encrypted and cannot be processed.")
47
+
48
+ case 9:
49
+ raise Exception("Tesseract rejected a custom configuration file.")
50
+
51
+ case 10:
52
+ raise Exception("A valid PDF was created, but conversion to PDF/A failed.")
53
+
54
+ case 15:
55
+ raise Exception("An unexpected or other error occurred.")
56
+
57
+ case 130:
58
+ raise Exception("The program was interrupted by the user (Ctrl+C). ")
59
+
16
60
 
17
61
  def _apply_whole_pdf_ocr(self):
18
62
  # Define the Commands
@@ -27,14 +71,65 @@ class OCRDataExtractor:
27
71
  # run the command
28
72
  result = subprocess.run(command, capture_output=True, text=True)
29
73
 
74
+ try:
75
+ self._raise_for_ocr_results(result)
76
+
77
+ except Exception as e:
78
+ print("Error during OCR process:\n", str(e))
79
+
80
+ if result.returncode == 6: # already has text
81
+ print("Retrying using forcing..")
82
+ # use force ocr
83
+ # Define the Commands
84
+ command = [
85
+ "ocrmypdf",
86
+ f"{self.input_pdf_path}",
87
+ f"{self.output_file_path}",
88
+ "--force-ocr",
89
+ "--tesseract-timeout", "1000" # increased timeout
90
+ ]
91
+
92
+ # run the command
93
+ result = subprocess.run(command, capture_output=True, text=True)
94
+
95
+ else:
96
+ raise e
97
+
98
+
30
99
  if result.returncode == 0:
31
- print("OCR on PDF Was successfully.")
100
+ print("OCR Was applied on the PDF.")
32
101
 
33
102
  return True
34
103
 
35
104
  return False
36
105
 
37
106
 
107
+ ########### Ratio Functions
108
+ def _text_coverage_ratio(self,page):
109
+ blocks = page.get_text("blocks")
110
+ page_area = page.rect.width * page.rect.height
111
+
112
+ text_area = 0
113
+ for block in blocks:
114
+ if block[6] == 0: # text block
115
+ x0, y0, x1, y1 = block[:4]
116
+ text_area += (x1 - x0) * (y1 - y0)
117
+
118
+ return text_area / page_area if page_area else 0
119
+
120
+ def _image_coverage_ratio(self, page):
121
+ images = page.get_images(full=True)
122
+ page_area = page.rect.width * page.rect.height
123
+ image_area = 0
124
+
125
+ for img in images:
126
+ xref = img[0]
127
+ rects = page.get_image_rects(xref)
128
+ for r in rects:
129
+ image_area += r.width * r.height
130
+
131
+ return image_area / page_area if page_area else 0
132
+
38
133
  def _is_whole_pdf_ocr(self):
39
134
  # read and extract the whole text from the pdf
40
135
  THRESHOLD_VALUE = 100 # minimum text
@@ -44,14 +139,31 @@ class OCRDataExtractor:
44
139
  for page_num in range(len(doc)):
45
140
  page = doc.load_page(page_num)
46
141
  text = page.get_text()
47
- if len(text.strip()) > THRESHOLD_VALUE:
48
- is_whole_pdf_ocr_applicable = False
49
- else:
142
+
143
+ # add logic if the image and text ratio is above a certain threshold
144
+ image_ratio = self._image_coverage_ratio(page)
145
+ text_ratio = self._text_coverage_ratio(page)
146
+ print(f"Image :\t{image_ratio*100:.2f}")
147
+
148
+
149
+ # if len(text.strip()) > THRESHOLD_VALUE:
150
+ if image_ratio > 0.45:
151
+ # print("Length of text:\n", len(text.strip()))
152
+ print(f"Image :\t{image_ratio*100:.2f}")
50
153
  # append to the page list
51
154
  self.pages_to_ocr.append(page_num)
155
+
156
+ elif text_ratio < 0.05:
157
+ print(f"Text :\t{text_ratio*100:.2f}")
158
+ # append to the page list
159
+ self.pages_to_ocr.append(page_num)
160
+
161
+ else:
162
+ is_whole_pdf_ocr_applicable = False
52
163
 
53
164
  return is_whole_pdf_ocr_applicable
54
165
 
166
+ ########### Page by Page Functions
55
167
  def _is_page_by_page_ocr(self):
56
168
  return len(self.pages_to_ocr) > 0
57
169
 
File without changes
File without changes