myocr-lib 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.1
2
+ Name: myocr_lib
3
+ Version: 0.1.1
4
+ Summary: To be available soon
5
+ Keywords: utility,library,functions,ocr,image-processing
6
+ Classifier: Development Status :: 3 - Alpha
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pytesseract>=0.3.13
15
+ Requires-Dist: ocrmypdf>=16.12.0
16
+ Requires-Dist: Pillow>=9.0.0
17
+ Requires-Dist: opencv-python>=4.12.0
18
+ Requires-Dist: PyMuPDF>=1.26.7
File without changes
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "myocr_lib" # Distribution name; the importable packages are ocr_img and ocr_pdf
7
+ version = "0.1.1"
8
+ # authors = [
9
+ # {name = "Muhammad Asif Ali", email = "creativedeveloper151214@gmail.com"}
10
+ # ]
11
+ description = "To be available soon"
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ ]
22
+ keywords = ["utility", "library","functions","ocr","image-processing"] # Add relevant keywords
23
+ dependencies = [
24
+ # Add your dependencies here, e.g.:
25
+ # "requests>=2.28.0",
26
+ # "numpy>=1.20.0",
27
+
28
+ "pytesseract>=0.3.13",
29
+ "ocrmypdf>=16.12.0",
30
+ "Pillow>=9.0.0",
31
+ "opencv-python>=4.12.0",
32
+ "PyMuPDF>=1.26.7",
33
+
34
+ ]
35
+
36
+
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.setuptools.package-dir]
42
+ "" = "src"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.1
2
+ Name: myocr_lib
3
+ Version: 0.1.1
4
+ Summary: To be available soon
5
+ Keywords: utility,library,functions,ocr,image-processing
6
+ Classifier: Development Status :: 3 - Alpha
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pytesseract>=0.3.13
15
+ Requires-Dist: ocrmypdf>=16.12.0
16
+ Requires-Dist: Pillow>=9.0.0
17
+ Requires-Dist: opencv-python>=4.12.0
18
+ Requires-Dist: PyMuPDF>=1.26.7
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/myocr_lib.egg-info/PKG-INFO
4
+ src/myocr_lib.egg-info/SOURCES.txt
5
+ src/myocr_lib.egg-info/dependency_links.txt
6
+ src/myocr_lib.egg-info/requires.txt
7
+ src/myocr_lib.egg-info/top_level.txt
8
+ src/ocr_img/__init__.py
9
+ src/ocr_img/main_code.py
10
+ src/ocr_pdf/__init__.py
11
+ src/ocr_pdf/main_code.py
@@ -0,0 +1,5 @@
1
+ pytesseract>=0.3.13
2
+ ocrmypdf>=16.12.0
3
+ Pillow>=9.0.0
4
+ opencv-python>=4.12.0
5
+ PyMuPDF>=1.26.7
@@ -0,0 +1,2 @@
1
+ ocr_img
2
+ ocr_pdf
@@ -0,0 +1,3 @@
1
+ from .main_code import ImageOCR
2
+ __all__ = ['ImageOCR']
3
+ __version__ = "0.1.1"
@@ -0,0 +1,56 @@
1
+ import cv2
2
+ from PIL import Image
3
+ import pytesseract
4
+
5
+
6
class ImageOCR:
    """Run Tesseract OCR on a single image file.

    The image is read with OpenCV as grayscale and binarised with Otsu's
    threshold.  If OpenCV returns no image (the original code attributes
    this to WebP input), the file is opened with Pillow instead and passed
    to Tesseract as an RGB image without thresholding.

    Args:
        image_path: Path of the image to process (``str`` or path-like).
        lang: Tesseract language code handed to ``pytesseract``.
        config: Extra Tesseract command-line flags handed to ``pytesseract``.
    """

    # Values that were previously hard-coded inside ``_apply_ocr``.
    DEFAULT_LANG = "eng"
    DEFAULT_CONFIG = "--oem 3 --psm 6"

    # Class-level fallbacks so instances created without ``__init__`` (for
    # example by a subclass that overrides it) still OCR with the defaults.
    lang = DEFAULT_LANG
    config = DEFAULT_CONFIG

    def __init__(self, image_path, *, lang=DEFAULT_LANG, config=DEFAULT_CONFIG):
        self.image_path = image_path
        self.lang = lang
        self.config = config

    def _preprocess_image(self, img_path):
        """Load ``img_path`` and prepare it for OCR.

        Returns:
            The Otsu-thresholded grayscale array when OpenCV could read the
            file, otherwise a Pillow image converted to RGB.
        """
        # Local import: this module's top-level imports do not include ``os``.
        import os

        # Normalise path-like objects once so both readers get a plain path.
        path = os.fspath(img_path)

        gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if gray is not None:
            # Light preprocessing (fast): keep only the thresholded image,
            # which is the second element of cv2.threshold's result.
            return cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

        # OpenCV could not read the file; fall back to Pillow.  The context
        # manager closes the underlying file handle, while ``convert``
        # returns an independent in-memory copy that stays valid afterwards.
        with Image.open(path) as fallback:
            return fallback.convert("RGB")

    def _apply_ocr(self, img):
        """Return the text Tesseract recognises in ``img``."""
        return pytesseract.image_to_string(
            img,
            lang=self.lang,
            config=self.config,
        )

    def run(self):
        """Preprocess the configured image, OCR it, print and return the text."""
        # preprocess the image for better results
        img = self._preprocess_image(self.image_path)

        # apply the ocr
        text = self._apply_ocr(img)

        print(text)

        return text
54
+
55
+
56
+
@@ -0,0 +1,3 @@
1
+ from .main_code import OCRDataExtractor
2
+ __all__ = ['OCRDataExtractor']
3
+ __version__ = "0.1.1"
@@ -0,0 +1,176 @@
1
+
2
+ import ocrmypdf
3
+ import fitz # PyMuPDF
4
+ import subprocess
5
+ import os
6
+
7
class OCRDataExtractor:
    """Add a text layer to a PDF with the ``ocrmypdf`` CLI and read it back.

    Each page's existing text is measured with PyMuPDF.  Pages whose
    stripped text is at most ``TEXT_THRESHOLD`` characters are considered
    to need OCR.  If every page needs it, the whole document is OCRed;
    otherwise only the listed pages are.  The OCRed copy is written to a
    temporary ``*_output.pdf`` next to the input and removed afterwards.

    Args:
        input_pdf_path: Path of the PDF to analyse (``str`` or path-like).
    """

    # A page with more stripped characters than this is treated as already
    # containing selectable text.
    TEXT_THRESHOLD = 100

    # Value passed to ocrmypdf's ``--tesseract-timeout`` (increased timeout).
    TESSERACT_TIMEOUT = "1000"

    def __init__(self, input_pdf_path):
        self.input_pdf_path = os.fspath(input_pdf_path)

        # Derive the output name from the real extension.  A plain
        # ``.replace('.pdf', ...)`` left the path unchanged for ``.PDF`` or
        # extension-less inputs, so the cleanup step deleted the input file;
        # it also rewrote ``.pdf`` occurring inside directory names.
        root, _ext = os.path.splitext(self.input_pdf_path)
        self.output_file_path = root + '_output.pdf'

        self.pages_to_ocr = []  # 0-based indices of the pages needing ocr

    def _run_ocrmypdf(self, extra_args, success_message):
        """Run ``ocrmypdf`` on the input PDF and report whether it succeeded.

        Args:
            extra_args: Additional CLI arguments placed after the two paths.
            success_message: Text printed when the process exits with 0.

        Returns:
            ``True`` when the exit code is 0, ``False`` otherwise.
        """
        command = [
            "ocrmypdf",
            f"{self.input_pdf_path}",
            f"{self.output_file_path}",
            # "--force-ocr",
            *extra_args,
            "--tesseract-timeout", self.TESSERACT_TIMEOUT,
        ]

        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode == 0:
            print(success_message)
            return True

        # Surface the captured diagnostics instead of discarding them, so a
        # failed conversion can be told apart from a PDF that needs no OCR.
        print(
            f"ocrmypdf failed with exit code {result.returncode}: "
            f"{(result.stderr or '').strip()}"
        )
        return False

    def _apply_whole_pdf_ocr(self):
        """OCR every page of the input PDF into the output file."""
        return self._run_ocrmypdf(
            [], "PDF Was converted to Selectable Successfully."
        )

    def _is_whole_pdf_ocr(self):
        """Scan the input PDF and record which pages need OCR.

        Rebuilds ``self.pages_to_ocr`` from scratch on every call, so
        repeated runs do not accumulate duplicate page numbers.

        Returns:
            ``True`` when no page exceeds ``TEXT_THRESHOLD`` characters of
            text, i.e. every page needs OCR.
        """
        pages_needing_ocr = []

        with fitz.open(self.input_pdf_path) as doc:
            page_count = len(doc)
            for page_num in range(page_count):
                page_text = doc.load_page(page_num).get_text()
                if len(page_text.strip()) <= self.TEXT_THRESHOLD:
                    pages_needing_ocr.append(page_num)

        self.pages_to_ocr = pages_needing_ocr
        return len(pages_needing_ocr) == page_count

    def _is_page_by_page_ocr(self):
        """Return ``True`` when at least one page was recorded as needing OCR."""
        return len(self.pages_to_ocr) > 0

    def _apply_page_by_page_ocr(self):
        """OCR only the pages listed in ``self.pages_to_ocr``."""
        # The stored indices are 0-based; ``--pages`` is given 1-based numbers.
        pages_str_format = ','.join(str(page + 1) for page in self.pages_to_ocr)

        print("Pages to extract text from...", pages_str_format)

        # NOTE(review): "--force-ocr" is disabled here; verify how ocrmypdf
        # treats selected pages that already hold a little text.
        return self._run_ocrmypdf(
            ['--pages', pages_str_format], "Ocr was applied on the pages."
        )

    def _extract_text_whole_pdf(self):
        """Return the text of every page of the output PDF as one string.

        Each page contributes a ``Page N Text`` header, a row of twenty
        asterisks, the page text and two trailing newlines.
        """
        sections = []
        with fitz.open(self.output_file_path) as doc:
            for page_num in range(len(doc)):
                page_text = doc.load_page(page_num).get_text()
                sections.append(
                    f'Page {page_num + 1} Text\n' + '*' * 20 + page_text + '\n\n'
                )

        return ''.join(sections)

    def _extract_text_page_by_page(self):
        """Return ``{page_index: text}`` for the pages listed in ``pages_to_ocr``."""
        text = {}

        with fitz.open(self.output_file_path) as doc:
            for page_num in self.pages_to_ocr:
                text[page_num] = doc.load_page(page_num).get_text()

        return text

    def get_ocr_results(self):
        """OCR the PDF as needed and return the extracted text.

        Returns:
            A single string when the whole PDF was OCRed, a dict mapping
            0-based page index to text when only some pages were OCRed, or
            ``None`` when no page needed OCR or ``ocrmypdf`` failed.
        """
        # apply the whole pdf ocr if all the pages are extractable
        if self._is_whole_pdf_ocr():
            print("Applying whole Pdf ocr...")

            try:
                succeeded = self._apply_whole_pdf_ocr()

                # apply the extraction for the whole pdf through fitz
                text = self._extract_text_whole_pdf() if succeeded else None
            finally:
                # delete the output file even if extraction raised
                self.delete_file(self.output_file_path)

            print(text)
            print("Done Successfully...")

            return text

        if self._is_page_by_page_ocr():
            print("Applying page by page ocr...")

            try:
                # do page by page
                succeeded = self._apply_page_by_page_ocr()

                # do the extraction for specific pages only through fitz
                text = self._extract_text_page_by_page() if succeeded else None
            finally:
                # delete the output file even if extraction raised
                self.delete_file(self.output_file_path)

            print(text)
            print("Done...")

            return text

        # Every page already has enough text.  This message used to sit
        # after the ``return`` and was therefore never printed.
        print("No Ocr needed...")
        return None

    def delete_file(self, file):
        """Remove ``file`` if it exists; a missing file is not an error."""
        try:
            os.remove(file)
        except FileNotFoundError:
            pass

    def get_ocred_pages(self):
        """Return the 0-based indices of the pages recorded as needing OCR."""
        return self.pages_to_ocr

    def run(self):
        """Run the OCR pipeline and return ``(text, pages_to_ocr)``."""
        text = self.get_ocr_results()
        pages_to_ignore = self.get_ocred_pages()

        return text, pages_to_ignore
172
+
173
+
174
+
175
+
176
+