PyPI - myocr-lib - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

myocr-lib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/METADATA +2 -3
myocr_lib-0.1.4.dist-info/RECORD +8 -0
{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/WHEEL +1 -1
ocr_img/main_code.py +1 -1
ocr_pdf/main_code.py +118 -6
myocr_lib-0.1.2.dist-info/RECORD +0 -8
{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/top_level.txt +0 -0

{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: myocr_lib
-Version: 0.1.2
+Version: 0.1.4
 Summary: To be available soon
 Keywords: utility,library,functions,ocr,image-processing
 Classifier: Development Status :: 3 - Alpha
@@ -16,4 +16,3 @@ Requires-Dist: ocrmypdf>=16.12.0
 Requires-Dist: Pillow>=9.0.0
 Requires-Dist: opencv-python>=4.12.0
 Requires-Dist: PyMuPDF>=1.26.7

myocr_lib-0.1.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
+ocr_img/main_code.py,sha256=738bf1zZTr1FRncvPfUik4SSS2nSMkPqrCFRtnQeGmM,1300
+ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
+ocr_pdf/main_code.py,sha256=7wYi6HPUoKyvqOk02qiPC2RR39CSKCRb9Eki8vhOes0,9368
+myocr_lib-0.1.4.dist-info/METADATA,sha256=7IopmcXCQifPCF8B_8AJRCVQIP8rQofXsLEA2sU1u5g,675
+myocr_lib-0.1.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+myocr_lib-0.1.4.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
+myocr_lib-0.1.4.dist-info/RECORD,,

{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.3.2)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

ocr_img/main_code.py CHANGED Viewed

@@ -42,7 +42,7 @@ class ImageOCR:
         # apply the ocr
         text = self._apply_ocr(img)
-        print(text)
+        # print(text)
         # temporary save to file for demo
         # get the extension

ocr_pdf/main_code.py CHANGED Viewed

@@ -13,6 +13,50 @@ class OCRDataExtractor:
         self.pages_to_ocr = [] # holds the pages needing ocr
+    def _raise_for_ocr_results(self, result):
+        """This function raises the corresponding and matching exceptions based on the value of thre result code."""
+        # key for the exceptions code
+        exceptions_codes = {exc:"" for exc in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 130]}
+        return_code = result.returncode
+        match return_code:
+            case 1:
+                raise Exception("Invalid arguments were provided.")
+            case 2:
+                raise Exception("The input file is not a valid PDF.")
+            case 3:
+                raise Exception("An external program (like Tesseract or Ghostscript) required by OCRmyPDF is missing.")
+            case 4:
+                raise Exception("An output file was created, but it is invalid (the file will still be produced).")
+            case 5:
+                raise Exception("Insufficient permissions to read the input or write the output file.")
+            case 6:
+                raise Exception("The file already contains text, and the command was not configured to force re-processing.")
+            case 7:
+                raise Exception("An error occurred in an external program (e.g., Ghostscript).")
+            case 8:
+                raise Exception("The input PDF is encrypted and cannot be processed.")
+            case 9:
+                raise Exception("Tesseract rejected a custom configuration file.")
+            case 10:
+                raise Exception("A valid PDF was created, but conversion to PDF/A failed.")
+            case 15:
+                raise Exception("An unexpected or other error occurred.")
+            case 130:
+                raise Exception("The program was interrupted by the user (Ctrl+C). ")
     def _apply_whole_pdf_ocr(self):
         # Define the Commands
@@ -27,14 +71,65 @@ class OCRDataExtractor:
         # run the command
         result = subprocess.run(command, capture_output=True, text=True)
+        try:
+            self._raise_for_ocr_results(result)
+        except Exception as e:
+            print("Error during OCR process:\n", str(e))
+            if result.returncode == 6: # already has text
+                print("Retrying using forcing..")
+                # use force ocr
+                # Define the Commands
+                command = [
+                    "ocrmypdf",
+                    f"{self.input_pdf_path}",
+                    f"{self.output_file_path}",
+                    "--force-ocr",
+                    "--tesseract-timeout", "1000" # increased timeout
+                ]
+                # run the command
+                result = subprocess.run(command, capture_output=True, text=True)
+            else:
+                raise e
         if result.returncode == 0:
-            print("PDF Was converted to Selectable Successfully.")
+            print("OCR Was applied on the PDF.")
             return True
         return False
+    ########### Ratio Functions
+    def _text_coverage_ratio(self,page):
+        blocks = page.get_text("blocks")
+        page_area = page.rect.width * page.rect.height
+        text_area = 0
+        for block in blocks:
+            if block[6] == 0:  # text block
+                x0, y0, x1, y1 = block[:4]
+                text_area += (x1 - x0) * (y1 - y0)
+        return text_area / page_area if page_area else 0
+    def _image_coverage_ratio(self, page):
+        images = page.get_images(full=True)
+        page_area = page.rect.width * page.rect.height
+        image_area = 0
+        for img in images:
+            xref = img[0]
+            rects = page.get_image_rects(xref)
+            for r in rects:
+                image_area += r.width * r.height
+        return image_area / page_area if page_area else 0
     def _is_whole_pdf_ocr(self):
         # read and extract the whole text from the pdf
         THRESHOLD_VALUE = 100 # minimum text
@@ -44,14 +139,31 @@ class OCRDataExtractor:
             for page_num in range(len(doc)):
                 page = doc.load_page(page_num)
                 text = page.get_text()
-                if len(text.strip()) > THRESHOLD_VALUE:
-                    is_whole_pdf_ocr_applicable = False
-                else:
+                # add logic if the image and text ratio is above a certain threshold
+                image_ratio = self._image_coverage_ratio(page)
+                text_ratio = self._text_coverage_ratio(page)
+                print(f"Image :\t{image_ratio*100:.2f}")
+                # if len(text.strip()) > THRESHOLD_VALUE:
+                if image_ratio > 0.45:
+                    # print("Length of text:\n", len(text.strip()))
+                    print(f"Image :\t{image_ratio*100:.2f}")
                     # append to the page list
                     self.pages_to_ocr.append(page_num)
+                elif text_ratio < 0.05:
+                    print(f"Text :\t{text_ratio*100:.2f}")
+                    # append to the page list
+                    self.pages_to_ocr.append(page_num)
+                else:
+                    is_whole_pdf_ocr_applicable = False
         return is_whole_pdf_ocr_applicable
+    ########### Page by Page Functions
     def _is_page_by_page_ocr(self):
         return len(self.pages_to_ocr) > 0
@@ -118,7 +230,7 @@ class OCRDataExtractor:
             # apply the extraction for the whole pdf through fitz
             text = self._extract_text_whole_pdf() if results else None
-            print(text)
+            # print(text)
             # temporary store the text
             # ext = self.input_pdf_path.split('.')[-1]
@@ -141,7 +253,7 @@ class OCRDataExtractor:
             # do the extraction for specific pages only throug fitz
             text = self._extract_text_page_by_page() if results else None
-            print(text)
+            # print(text)
             # delete the output file
             self.delete_file(self.output_file_path)

myocr_lib-0.1.2.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-ocr_img/__init__.py,sha256=A0HMZNNgwObl6qZHeGKXQSf-VhPlkQt3YKEcA_VXHLw,80
-ocr_img/main_code.py,sha256=aqE-NGFDCJ_nqpMt7GujRvEin6dB4kCK9DEQiADnMRs,1298
-ocr_pdf/__init__.py,sha256=t4SYasAJ1pjnd7sZBnPe1PqMbR18-4onsHXu93zw1LE,96
-ocr_pdf/main_code.py,sha256=-ivDkwevfiLXEj-OtrPP5I9djBgec9xJV0ltIe4TQfg,5188
-myocr_lib-0.1.2.dist-info/METADATA,sha256=UdefSkJ_0SjJdGImDdkCyriWL5Krf3mHOnLohQjIX8c,677
-myocr_lib-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-myocr_lib-0.1.2.dist-info/top_level.txt,sha256=m22oM9I_3SovUyKNzGQIpqgCp7GTXua0G7cEYqMx654,16
-myocr_lib-0.1.2.dist-info/RECORD,,

{myocr_lib-0.1.2.dist-info → myocr_lib-0.1.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

myocr-lib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

myocr-lib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl