myocr-lib 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- myocr_lib-0.1.1/PKG-INFO +18 -0
- myocr_lib-0.1.1/README.md +0 -0
- myocr_lib-0.1.1/pyproject.toml +42 -0
- myocr_lib-0.1.1/setup.cfg +4 -0
- myocr_lib-0.1.1/src/myocr_lib.egg-info/PKG-INFO +18 -0
- myocr_lib-0.1.1/src/myocr_lib.egg-info/SOURCES.txt +11 -0
- myocr_lib-0.1.1/src/myocr_lib.egg-info/dependency_links.txt +1 -0
- myocr_lib-0.1.1/src/myocr_lib.egg-info/requires.txt +5 -0
- myocr_lib-0.1.1/src/myocr_lib.egg-info/top_level.txt +2 -0
- myocr_lib-0.1.1/src/ocr_img/__init__.py +3 -0
- myocr_lib-0.1.1/src/ocr_img/main_code.py +56 -0
- myocr_lib-0.1.1/src/ocr_pdf/__init__.py +3 -0
- myocr_lib-0.1.1/src/ocr_pdf/main_code.py +176 -0
myocr_lib-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: myocr_lib
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: To be available soon
|
|
5
|
+
Keywords: utility,library,functions,ocr,image-processing
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: pytesseract>=0.3.13
|
|
15
|
+
Requires-Dist: ocrmypdf>=16.12.0
|
|
16
|
+
Requires-Dist: Pillow>=9.0.0
|
|
17
|
+
Requires-Dist: opencv-python>=4.12.0
|
|
18
|
+
Requires-Dist: PyMuPDF>=1.26.7
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "myocr_lib" # Replace with your actual library name
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
# authors = [
|
|
9
|
+
# {name = "Muhammad Asif Ali", email = "creativedeveloper151214@gmail.com"}
|
|
10
|
+
# ]
|
|
11
|
+
description = "To be available soon"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
]
|
|
22
|
+
keywords = ["utility", "library","functions","ocr","image-processing"] # Add relevant keywords
|
|
23
|
+
dependencies = [
|
|
24
|
+
# Add your dependencies here, e.g.:
|
|
25
|
+
# "requests>=2.28.0",
|
|
26
|
+
# "numpy>=1.20.0",
|
|
27
|
+
|
|
28
|
+
"pytesseract>=0.3.13",
|
|
29
|
+
"ocrmypdf>=16.12.0",
|
|
30
|
+
"Pillow>=9.0.0",
|
|
31
|
+
"opencv-python>=4.12.0",
|
|
32
|
+
"PyMuPDF>=1.26.7",
|
|
33
|
+
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.package-dir]
|
|
42
|
+
"" = "src"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: myocr_lib
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: To be available soon
|
|
5
|
+
Keywords: utility,library,functions,ocr,image-processing
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: pytesseract>=0.3.13
|
|
15
|
+
Requires-Dist: ocrmypdf>=16.12.0
|
|
16
|
+
Requires-Dist: Pillow>=9.0.0
|
|
17
|
+
Requires-Dist: opencv-python>=4.12.0
|
|
18
|
+
Requires-Dist: PyMuPDF>=1.26.7
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/myocr_lib.egg-info/PKG-INFO
|
|
4
|
+
src/myocr_lib.egg-info/SOURCES.txt
|
|
5
|
+
src/myocr_lib.egg-info/dependency_links.txt
|
|
6
|
+
src/myocr_lib.egg-info/requires.txt
|
|
7
|
+
src/myocr_lib.egg-info/top_level.txt
|
|
8
|
+
src/ocr_img/__init__.py
|
|
9
|
+
src/ocr_img/main_code.py
|
|
10
|
+
src/ocr_pdf/__init__.py
|
|
11
|
+
src/ocr_pdf/main_code.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import cv2
|
|
2
|
+
from PIL import Image
|
|
3
|
+
import pytesseract
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ImageOCR:
    """Run Tesseract OCR on a single image file.

    The image is loaded with OpenCV and binarised; when OpenCV cannot decode
    the file, Pillow is used as a fallback loader instead.
    """

    def __init__(self, image_path):
        """Remember the path of the image to process.

        Args:
            image_path: Path of the image file. It is handed unchanged to
                ``cv2.imread`` and, on fallback, to ``PIL.Image.open``.
        """
        self.image_path = image_path

    def _preprocess_image(self, img_path):
        """Load *img_path* and prepare it for OCR.

        Args:
            img_path: Path of the image file to load.

        Returns:
            The Otsu-thresholded grayscale image when OpenCV can read the
            file; otherwise an RGB Pillow image with no further processing.
        """
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        if img is not None:
            # Light preprocessing (fast): global Otsu binarisation.
            return cv2.threshold(
                img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

        # cv2.imread signals "could not read" by returning None (the original
        # author met this with WebP input), so fall back to Pillow.
        # The context manager closes the source file deterministically;
        # convert() hands back an independent, fully loaded copy.
        with Image.open(img_path) as source:
            return source.convert("RGB")

    def _apply_ocr(self, img):
        """Return the text Tesseract recognises in *img*.

        Uses the English language data with ``--oem 3 --psm 6``.
        """
        return pytesseract.image_to_string(
            img,
            lang="eng",
            config="--oem 3 --psm 6",
        )

    def run(self):
        """Preprocess the image, OCR it, print the text and return it.

        Returns:
            The recognised text as returned by ``pytesseract``.
        """
        # Preprocess the image for better results, then apply the OCR.
        img = self._preprocess_image(self.image_path)
        text = self._apply_ocr(img)

        print(text)

        return text
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
|
|
2
|
+
import ocrmypdf
|
|
3
|
+
import fitz # PyMuPDF
|
|
4
|
+
import subprocess
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
class OCRDataExtractor:
    """Extract text from a PDF, running ``ocrmypdf`` on pages that need it.

    A page "needs OCR" when the text PyMuPDF can already extract from it is
    at most ``MIN_TEXT_CHARS`` characters long (after stripping whitespace).
    If every page needs OCR the whole document is processed; if only some
    do, just those pages are processed; otherwise nothing is done.
    """

    # Pages with at most this many extractable characters are sent to OCR.
    MIN_TEXT_CHARS = 100
    # Value passed to ocrmypdf's --tesseract-timeout (increased timeout).
    TESSERACT_TIMEOUT = "1000"

    def __init__(self, input_pdf_path):
        """Set up paths for one input PDF.

        Args:
            input_pdf_path: Path of the PDF to process (``str`` or
                ``os.PathLike``).

        The temporary OCR output is written next to the input as
        ``<name>_output<ext>``. The name is derived with ``os.path.splitext``
        so it can never equal the input path, whatever the extension's
        case, and directory names containing ``.pdf`` are left untouched.
        """
        self.input_pdf_path = os.fspath(input_pdf_path)
        root, ext = os.path.splitext(self.input_pdf_path)
        self.output_file_path = f"{root}_output{ext or '.pdf'}"
        self.pages_to_ocr = []  # 0-based indices of the pages needing OCR

    def _run_ocrmypdf(self, extra_args=()):
        """Run the ``ocrmypdf`` CLI from input to output; return success.

        Args:
            extra_args: Additional command-line arguments inserted before
                the timeout option.

        Returns:
            ``True`` when the process exits with code 0, else ``False``.
        """
        # NOTE(review): "--force-ocr" was deliberately left disabled by the
        # original author; ocrmypdf may reject pages that already carry
        # some text without it or --skip-text -- confirm against real input.
        command = [
            "ocrmypdf",
            self.input_pdf_path,
            self.output_file_path,
            *extra_args,
            "--tesseract-timeout", self.TESSERACT_TIMEOUT,
        ]

        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            # Surface the failure instead of silently discarding stderr.
            print(
                f"ocrmypdf failed (exit code {result.returncode}): "
                f"{result.stderr.strip()}"
            )
            return False

        return True

    def _apply_whole_pdf_ocr(self):
        """OCR the entire document; return ``True`` on success."""
        if not self._run_ocrmypdf():
            return False

        print("PDF Was converted to Selectable Successfully.")
        return True

    def _is_whole_pdf_ocr(self):
        """Scan the input and decide whether every page needs OCR.

        Side effect: rebuilds ``self.pages_to_ocr`` from scratch with the
        0-based indices of all pages whose extractable text is at most
        ``MIN_TEXT_CHARS`` characters, so repeated calls do not accumulate
        duplicates.

        Returns:
            ``True`` when no page exceeds the threshold.
        """
        with fitz.open(self.input_pdf_path) as doc:
            page_total = len(doc)
            self.pages_to_ocr = [
                page_num
                for page_num, page in enumerate(doc)
                if len(page.get_text().strip()) <= self.MIN_TEXT_CHARS
            ]

        return len(self.pages_to_ocr) == page_total

    def _is_page_by_page_ocr(self):
        """Return ``True`` when at least one page was flagged for OCR."""
        return len(self.pages_to_ocr) > 0

    def _apply_page_by_page_ocr(self):
        """OCR only the flagged pages; return ``True`` on success."""
        # ocrmypdf's --pages option is 1-based, pages_to_ocr is 0-based.
        pages_str_format = ','.join(str(page + 1) for page in self.pages_to_ocr)

        print("Pages to extract text from...", pages_str_format)

        if not self._run_ocrmypdf(['--pages', pages_str_format]):
            return False

        print("Ocr was applied on the pages.")
        return True

    def _extract_text_whole_pdf(self):
        """Return the text of every page of the OCR output as one string.

        Each page contributes a ``Page <n> Text`` header (1-based), a row of
        twenty asterisks, the page text, and two trailing newlines.
        """
        with fitz.open(self.output_file_path) as doc:
            return ''.join(
                f'Page {page_num + 1} Text\n' + '*' * 20 + page.get_text() + '\n\n'
                for page_num, page in enumerate(doc)
            )

    def _extract_text_page_by_page(self):
        """Return ``{page_index: text}`` for the flagged pages only.

        Keys are the 0-based page indices stored in ``self.pages_to_ocr``.
        """
        with fitz.open(self.output_file_path) as doc:
            return {
                page_num: doc.load_page(page_num).get_text()
                for page_num in self.pages_to_ocr
            }

    def get_ocr_results(self):
        """Run whichever OCR mode applies and return the extracted text.

        Returns:
            * a ``str`` with all pages when the whole PDF was OCRed,
            * a ``dict`` mapping 0-based page index to text when only some
              pages were OCRed,
            * ``None`` when no OCR was needed or when ``ocrmypdf`` failed.

        The temporary output file is always removed, even if extraction
        raises.
        """
        # Apply the whole-PDF OCR if every page lacks extractable text.
        if self._is_whole_pdf_ocr():
            print("Applying whole Pdf ocr...")

            try:
                results = self._apply_whole_pdf_ocr()
                text = self._extract_text_whole_pdf() if results else None
            finally:
                self.delete_file(self.output_file_path)

            print(text)
            print("Done Successfully...")
            return text

        if self._is_page_by_page_ocr():
            print("Applying page by page ocr...")

            try:
                results = self._apply_page_by_page_ocr()
                text = self._extract_text_page_by_page() if results else None
            finally:
                self.delete_file(self.output_file_path)

            print(text)
            print("Done...")
            return text

        # Every page already has enough text: nothing to OCR.
        print("No Ocr needed...")
        return None

    def delete_file(self, file):
        """Remove *file* if it exists; a missing file is not an error."""
        try:
            os.remove(file)
        except FileNotFoundError:
            pass

    def get_ocred_pages(self):
        """Return the list of 0-based page indices flagged for OCR."""
        return self.pages_to_ocr

    def run(self):
        """Process the PDF and return ``(text, pages_to_ocr)``.

        ``text`` is whatever ``get_ocr_results`` returns; the second item is
        the list of 0-based indices of the pages that were flagged for OCR.
        """
        text = self.get_ocr_results()
        pages_to_ignore = self.get_ocred_pages()

        return text, pages_to_ignore
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|