simple-ocr 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/simple-ocr/image.rb +3 -2
- data/lib/simple-ocr/path.rb +5 -5
- data/lib/simple-ocr/scan.rb +5 -9
- data/lib/textcleaner +564 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0be030f3372adbcdaf03d9eeaa5be0a4b09a138f
|
4
|
+
data.tar.gz: 9fb31cb8b19cd79e6b03ad460867b17d96ffc581
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 395dd09141e4ee03841e23ed86e3333943b97fb3f47b8a46bb82f0513d01d844969cefc20576e4c129179d258df8c5579fdc4d9bb72cc9eb40112cbf5043425a
|
7
|
+
data.tar.gz: b0de671ff7cf564bc2d90b57d003bff019097ca22b1fa9597fe99ed66b72df0c7ff53f75e8bf81c7045223f7c4b07ad35ebfe7c8ccf334dd86ef7f36eac0b067
|
data/lib/simple-ocr/image.rb
CHANGED
@@ -13,7 +13,8 @@ module OCR
|
|
13
13
|
#
|
14
14
|
# @params [String, String, String] path to output file, options of conversion (e.g. Language), output format of file.
|
15
15
|
def scan(output_file, options, type)
|
16
|
-
|
17
|
-
|
16
|
+
Scan.new(@image, output_file, options, type).scan_img
|
17
|
+
end
|
18
|
+
|
18
19
|
end
|
19
20
|
end
|
data/lib/simple-ocr/path.rb
CHANGED
@@ -21,7 +21,7 @@ module OCR
|
|
21
21
|
#
|
22
22
|
# @return [String] input file path
|
23
23
|
def duplicate_path
|
24
|
-
@input_file.dup
|
24
|
+
return @input_file.dup
|
25
25
|
end
|
26
26
|
|
27
27
|
# From PDF to Image conversion
|
@@ -30,7 +30,7 @@ module OCR
|
|
30
30
|
def image_path
|
31
31
|
duppath = duplicate_path
|
32
32
|
duppath[name_exten[1]] = Path::EXTENS[:png]
|
33
|
-
duppath
|
33
|
+
return duppath
|
34
34
|
end
|
35
35
|
|
36
36
|
# Clean your Input File
|
@@ -38,15 +38,15 @@ module OCR
|
|
38
38
|
# @return [String] Cleaned Image Path
|
39
39
|
def clean_image_path
|
40
40
|
duppath = duplicate_path
|
41
|
-
duppath[get_filename] = "cleaned_"+
|
42
|
-
duppath
|
41
|
+
duppath[get_filename] = "cleaned_"+name_exten[0]+".png"
|
42
|
+
return duppath
|
43
43
|
end
|
44
44
|
|
45
45
|
# Get the FileName
|
46
46
|
#
|
47
47
|
# @return [String] Filename
|
48
48
|
def get_filename
|
49
|
-
File.basename(@input_file)
|
49
|
+
File.basename(@input_file).split("/")[0]
|
50
50
|
end
|
51
51
|
end
|
52
52
|
end
|
data/lib/simple-ocr/scan.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'open3'
|
2
|
-
require 'fileutils'
|
3
2
|
|
4
3
|
module OCR
|
5
4
|
class Scan
|
@@ -14,20 +13,20 @@ module OCR
|
|
14
13
|
@options = options
|
15
14
|
@type = handle_output_type(type)
|
16
15
|
@input_file = input_file
|
17
|
-
if
|
16
|
+
if OCR::Path.new(input_file).name_exten[1] == OCR::Path::EXTENS[:pdf]
|
18
17
|
@image = OCR::Path.new(input_file).image_path
|
19
18
|
convert_to_img
|
20
19
|
else
|
21
20
|
@image = input_file
|
22
21
|
end
|
23
|
-
@clean_image = OCR::Path.new(
|
22
|
+
@clean_image = OCR::Path.new(input_file).clean_image_path
|
24
23
|
end
|
25
24
|
|
26
25
|
def handle_output_type(type)
|
27
26
|
if type == :pdf
|
28
|
-
|
27
|
+
"pdf"
|
29
28
|
elsif type == :hocr
|
30
|
-
|
29
|
+
"hocr"
|
31
30
|
else
|
32
31
|
nil.to_s
|
33
32
|
end
|
@@ -60,11 +59,8 @@ module OCR
|
|
60
59
|
# Deleting unnecessary files after processing.
|
61
60
|
def delete_files
|
62
61
|
FileUtils.rm_rf(@clean_image)
|
63
|
-
FileUtils.rm_rf(@image) if pdf
|
62
|
+
FileUtils.rm_rf(@image) if OCR::Path.new(@input_file).name_exten[1] == "pdf"
|
64
63
|
end
|
65
64
|
|
66
|
-
def pdf?(input_file = @input_file)
|
67
|
-
OCR::Path.new(input_file).name_exten[1] == OCR::Path::EXTENS[:pdf]
|
68
|
-
end
|
69
65
|
end
|
70
66
|
end
|
data/lib/textcleaner
ADDED
@@ -0,0 +1,564 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
#
|
3
|
+
# Developed by Fred Weinhaus 6/9/2009 .......... revised 6/26/2015
|
4
|
+
#
|
5
|
+
# ------------------------------------------------------------------------------
|
6
|
+
#
|
7
|
+
# Licensing:
|
8
|
+
#
|
9
|
+
# Copyright © Fred Weinhaus
|
10
|
+
#
|
11
|
+
# My scripts are available free of charge for non-commercial use, ONLY.
|
12
|
+
#
|
13
|
+
# For use of my scripts in commercial (for-profit) environments or
|
14
|
+
# non-free applications, please contact me (Fred Weinhaus) for
|
15
|
+
# licensing arrangements. My email address is fmw at alink dot net.
|
16
|
+
#
|
17
|
+
# If you: 1) redistribute, 2) incorporate any of these scripts into other
|
18
|
+
# free applications or 3) reprogram them in another scripting language,
|
19
|
+
# then you must contact me for permission, especially if the result might
|
20
|
+
# be used in a commercial or for-profit environment.
|
21
|
+
#
|
22
|
+
# My scripts are also subject, in a subordinate manner, to the ImageMagick
|
23
|
+
# license, which can be found at: http://www.imagemagick.org/script/license.php
|
24
|
+
#
|
25
|
+
# ------------------------------------------------------------------------------
|
26
|
+
#
|
27
|
+
####
|
28
|
+
#
|
29
|
+
# USAGE: textcleaner [-r rotate] [-l layout] [-c cropoff] [-g] [-e enhance ] [-f filtersize] [-o offset] [-u] [-t threshold] [-s sharpamt] [-S saturation] [-a adaptblur] [-T] [-p padamt] [-b bgcolor] infile outfile
|
30
|
+
# USAGE: textcleaner [-help]
|
31
|
+
#
|
32
|
+
# OPTIONS:
|
33
|
+
#
|
34
|
+
# -r rotate rotate image 90 degrees in direction specified if
|
35
|
+
# aspect ratio does not match layout; options are cw
|
36
|
+
# (or clockwise), ccw (or counterclockwise) and n
|
37
|
+
# (or none); default=none or no rotation
|
38
|
+
# -l layout desired layout; options are p (or portrait) or
|
39
|
+
# l (or landscape); default=portrait
|
40
|
+
# -c cropoff image cropping offsets after potential rotate 90;
|
41
|
+
# choices: one, two or four non-negative integer comma
|
42
|
+
# separated values; one value will crop all around;
|
43
|
+
# two values will crop at left/right,top/bottom;
|
44
|
+
# four values will crop left,top,right,bottom
|
45
|
+
# -g convert document to grayscale before enhancing
|
46
|
+
# -e enhance enhance image brightness before cleaning;
|
47
|
+
# choices are: none, stretch or normalize;
|
48
|
+
# default=stretch
|
49
|
+
# -f filtersize size of filter used to clean background;
|
50
|
+
# integer>0; default=15
|
51
|
+
# -o offset offset of filter in percent used to reduce noise;
|
52
|
+
# integer>=0; default=5
|
53
|
+
# -u unrotate image; cannot unrotate more than
|
54
|
+
# about 5 degrees
|
55
|
+
# -t threshold text smoothing threshold; 0<=threshold<=100;
|
56
|
+
# nominal value is about 50; default is no smoothing
|
57
|
+
# -s sharpamt sharpening amount in pixels; float>=0;
|
58
|
+
# nominal about 1; default=0
|
59
|
+
# -S saturation color saturation expressed as percent; integer>=0;
|
60
|
+
# only applicable if -g not set; a value of 100 is
|
61
|
+
# no change; default=200 (double saturation)
|
62
|
+
# -a adaptblur alternate text smoothing using adaptive blur;
|
63
|
+
# floats>=0; default=0 (no smoothing)
|
64
|
+
# -T trim background around outer part of image
|
65
|
+
# -p padamt border pad amount around outer part of image;
|
66
|
+
# integer>=0; default=0
|
67
|
+
# -b bgcolor desired color for background; default=white
|
68
|
+
#
|
69
|
+
###
|
70
|
+
#
|
71
|
+
# NAME: TEXTCLEANER
|
72
|
+
#
|
73
|
+
# PURPOSE: To process a scanned document of text to clean the text background.
|
74
|
+
#
|
75
|
+
# DESCRIPTION: TEXTCLEANER processes a scanned document of text to clean
|
76
|
+
# the text background and enhance the text. The order of processing is:
|
77
|
+
# 1) optional 90 degree rotate if aspect does not match layout
|
78
|
+
# 2) optional crop,
|
79
|
+
# 3) optional convert to grayscale,
|
80
|
+
# 4) optional enhance,
|
81
|
+
# 5) filter to clean background and optionally smooth/antialias,
|
82
|
+
# 6) optional unrotate (limited to about 5 degrees or less),
|
83
|
+
# 7) optional text smoothing,
|
84
|
+
# 8) optional sharpening,
|
85
|
+
# 9) optional saturation change (if -g is not specified),
|
86
|
+
# 10) optional alternate text smoothing via adaptive blur
|
87
|
+
# 11) optional auto trim of border (effective only if background well-cleaned),
|
88
|
+
# 12) optional pad of border
|
89
|
+
#
|
90
|
+
# OPTIONS:
|
91
|
+
#
|
92
|
+
# -r rotate ... ROTATE image either clockwise or counterclockwise by 90 degrees,
|
93
|
+
# if image aspect ratio does not match the layout mode. Choices are: cw (or
|
94
|
+
# clockwise), ccw (or counterclockwise) and n (or none). The default is no rotation.
|
95
|
+
#
|
96
|
+
# -l layout ... LAYOUT for determining if rotation is to be applied. The choices
|
97
|
+
# are p (or portrait) or l (or landscape). The image will be rotated if rotate is
|
98
|
+
# specified and the aspect ratio of the image does not match the layout chosen.
|
99
|
+
# The default is portrait.
|
100
|
+
#
|
101
|
+
# -c cropoffsets ... CROPOFFSETS are the image cropping offsets after potential
|
102
|
+
# rotate 90. Choices: one, two or four non-negative integer comma separated
|
103
|
+
# values. One value will crop all around. Two values will crop at
|
104
|
+
# left/right,top/bottom. Four values will crop left,top,right,bottom.
|
105
|
+
#
|
106
|
+
# -g ... Convert the document to grayscale.
|
107
|
+
#
|
108
|
+
# -e enhance ... ENHANCE brightness of image. The choices are: none, stretch,
|
109
|
+
# or normalize. The default=stretch.
|
110
|
+
#
|
111
|
+
# -f filtersize ... FILTERSIZE is the size of the filter used to clean up the
|
112
|
+
# background. Values are integers>0. The filtersize needs to be larger than
|
113
|
+
# the thickness of the writing, but the smaller the better beyond this. Making it
|
114
|
+
# larger will increase the processing time and may lose text. The default is 15.
|
115
|
+
#
|
116
|
+
# -o offset ... OFFSET is the offset threshold in percent used by the filter
|
117
|
+
# to eliminate noise. Values are integers>=0. Values too small will leave much
|
118
|
+
# noise and artifacts in the result. Values too large will remove too much
|
119
|
+
# text leaving gaps. The default is 5.
|
120
|
+
#
|
121
|
+
# -u ... UNROTATE the image. This is limited to about 5 degrees or less.
|
122
|
+
#
|
123
|
+
# -t threshold ... THRESHOLD is the text smoothing threshold. Values are integers
|
124
|
+
# between 0 and 100. Smaller values smooth/thicken the text more. Larger values
|
125
|
+
# thin, but can result in gaps in the text. Nominal value is in the middle at
|
126
|
+
# about 50. The default is to disable smoothing.
|
127
|
+
#
|
128
|
+
# -s sharpamt ... SHARPAMT is the amount of pixel sharpening to be applied to
|
129
|
+
# the resulting text. Values are floats>=0. If used, it should be small
|
130
|
+
# (suggested about 1). The default=0 (no sharpening).
|
131
|
+
#
|
132
|
+
# -S saturation ... SATURATION is the desired color saturation of the text
|
133
|
+
# expressed as a percentage. Values are integers>=0. A value of 100 is no change.
|
134
|
+
# Larger values will make the text colors more saturated. The default=200
|
135
|
+
# indicates double saturation. Not applicable when -g option specified.
|
136
|
+
#
|
137
|
+
# -a adaptblur ... ADAPTBLUR applies an alternate text smoothing using
|
138
|
+
# an adaptive blur. The values are floats>=0. The default=0 indicates no
|
139
|
+
# blurring.
|
140
|
+
#
|
141
|
+
# -T ... TRIM the border around the image.
|
142
|
+
#
|
143
|
+
# -p padamt ... PADAMT is the border pad amount in pixels. The default=0.
|
144
|
+
#
|
145
|
+
# -b bgcolor ... BGCOLOR is the desired background color after it has been
|
146
|
+
# cleaned up. Any valid IM color may be used. The default is white.
|
147
|
+
#
|
148
|
+
# CAVEAT: No guarantee that this script will work on all platforms,
|
149
|
+
# nor that trapping of inconsistent parameters is complete and
|
150
|
+
# foolproof. Use At Your Own Risk.
|
151
|
+
#
|
152
|
+
######
|
153
|
+
#
|
154
|
+
|
155
|
+
# set default values
|
156
|
+
rotate="none" # rotate 90 clockwise (cw) or counterclockwise (ccw)
|
157
|
+
layout="portrait" # rotate 90 to match layout; portrait or landscape
|
158
|
+
cropoff="" # crop amounts; comma separate list of 1, 2 or 4 integers
|
159
|
+
numcrops=0 # number of crops flag
|
160
|
+
gray="no" # convert to grayscale flag
|
161
|
+
enhance="stretch" # none, stretch, normalize
|
162
|
+
filtersize=15 # local area filter size
|
163
|
+
offset=5 # local area offset to remove "noise"; too small-get noise, too large-lose text
|
164
|
+
threshold="" # smoothing threshold
|
165
|
+
sharpamt=0 # sharpen sigma
|
166
|
+
saturation=200 # color saturation percent; 100 is no change
|
167
|
+
adaptblur=0 # adaptive blur
|
168
|
+
unrotate="no" # unrotate flag
|
169
|
+
trim="no" # trim flag
|
170
|
+
padamt=0 # pad amount
|
171
|
+
bgcolor="white" # color for output whiteboard background
|
172
|
+
|
173
|
+
# set directory for temporary files
|
174
|
+
dir="/tmp" # suggestions are dir="." or dir="/tmp"
|
175
|
+
|
176
|
+
# set up functions to report Usage and Usage with Description
|
177
|
+
PROGNAME=`type $0 | awk '{print $3}'` # search for executable on path
|
178
|
+
PROGDIR=`dirname $PROGNAME` # extract directory of program
|
179
|
+
PROGNAME=`basename $PROGNAME` # base name of program
|
180
|
+
usage1()
|
181
|
+
{
|
182
|
+
echo >&2 ""
|
183
|
+
echo >&2 "$PROGNAME:" "$@"
|
184
|
+
sed >&2 -e '1,/^####/d; /^###/g; /^#/!q; s/^#//; s/^ //; 4,$p' "$PROGDIR/$PROGNAME"
|
185
|
+
}
|
186
|
+
usage2()
|
187
|
+
{
|
188
|
+
echo >&2 ""
|
189
|
+
echo >&2 "$PROGNAME:" "$@"
|
190
|
+
sed >&2 -e '1,/^####/d; /^######/g; /^#/!q; s/^#*//; s/^ //; 4,$p' "$PROGDIR/$PROGNAME"
|
191
|
+
}
|
192
|
+
|
193
|
+
|
194
|
+
# function to report error messages
|
195
|
+
errMsg()
|
196
|
+
{
|
197
|
+
echo ""
|
198
|
+
echo $1
|
199
|
+
echo ""
|
200
|
+
usage1
|
201
|
+
exit 1
|
202
|
+
}
|
203
|
+
|
204
|
+
|
205
|
+
# function to test for minus at start of value of second part of option 1 or 2
|
206
|
+
checkMinus()
|
207
|
+
{
|
208
|
+
test=`echo "$1" | grep -c '^-.*$'` # returns 1 if match; 0 otherwise
|
209
|
+
[ $test -eq 1 ] && errMsg "$errorMsg"
|
210
|
+
}
|
211
|
+
|
212
|
+
# test for correct number of arguments and get values
|
213
|
+
if [ $# -eq 0 ]
|
214
|
+
then
|
215
|
+
# help information
|
216
|
+
echo ""
|
217
|
+
usage2
|
218
|
+
exit 0
|
219
|
+
elif [ $# -gt 27 ]
|
220
|
+
then
|
221
|
+
errMsg "--- TOO MANY ARGUMENTS WERE PROVIDED ---"
|
222
|
+
else
|
223
|
+
while [ $# -gt 0 ]
|
224
|
+
do
|
225
|
+
# get parameter values
|
226
|
+
case "$1" in
|
227
|
+
-h|-help) # help information
|
228
|
+
echo ""
|
229
|
+
usage2
|
230
|
+
exit 0
|
231
|
+
;;
|
232
|
+
-r) # rotate
|
233
|
+
shift # to get the next parameter
|
234
|
+
# test if parameter starts with minus sign
|
235
|
+
errorMsg="--- INVALID ROTATE SPECIFICATION ---"
|
236
|
+
checkMinus "$1"
|
237
|
+
rotate=`echo "$1" | tr "[:upper:]" "[:lower:]"`
|
238
|
+
case "$rotate" in
|
239
|
+
none|n) rotate="none" ;;
|
240
|
+
clockwise|cw) rotate="cw" ;;
|
241
|
+
counterclockwise|ccw) rotate="ccw" ;;
|
242
|
+
*) errMsg "--- ROTATE=$rotate IS NOT A VALID CHOICE ---" ;;
|
243
|
+
esac
|
244
|
+
;;
|
245
|
+
-l) # layout
|
246
|
+
shift # to get the next parameter
|
247
|
+
# test if parameter starts with minus sign
|
248
|
+
errorMsg="--- INVALID LAYOUT SPECIFICATION ---"
|
249
|
+
checkMinus "$1"
|
250
|
+
layout=`echo "$1" | tr "[:upper:]" "[:lower:]"`
|
251
|
+
case "$layout" in
|
252
|
+
portrait|p) layout="portrait" ;;
|
253
|
+
landscape|l) layout="landscape" ;;
|
254
|
+
*) errMsg "--- LAYOUT=$layout IS NOT A VALID CHOICE ---" ;;
|
255
|
+
esac
|
256
|
+
;;
|
257
|
+
-c) # get cropoffsets
|
258
|
+
shift # to get the next parameter
|
259
|
+
# test if parameter starts with minus sign
|
260
|
+
errorMsg="--- INVALID CROPOFFSETS SPECIFICATION ---"
|
261
|
+
checkMinus "$1"
|
262
|
+
cropoff="$1"
|
263
|
+
cropoff="${cropoff},"
|
264
|
+
cropoff=`expr "$cropoff" : '\([,0-9]*\)'`
|
265
|
+
numcrops=`echo "$cropoff" | tr "," " " | wc -w`
|
266
|
+
[ "$cropoff" = "" ] && errMsg "--- ONE OR TWO OR FOUR OFFSETS MUST BE PROVIDED ---"
|
267
|
+
[ $numcrops -ne 1 -a $numcrops -ne 2 -a $numcrops -ne 4 ] && errMsg "--- ONE OR TWO OR FOUR OFFSETS MUST BE PROVIDED ---"
|
268
|
+
crop1=`echo "$cropoff" | cut -d, -f1`
|
269
|
+
crop2=`echo "$cropoff" | cut -d, -f2`
|
270
|
+
crop3=`echo "$cropoff" | cut -d, -f3`
|
271
|
+
crop4=`echo "$cropoff" | cut -d, -f4`
|
272
|
+
;;
|
273
|
+
-g) # set grayscale
|
274
|
+
gray="yes"
|
275
|
+
;;
|
276
|
+
-e) # get enhance
|
277
|
+
shift # to get the next parameter
|
278
|
+
# test if parameter starts with minus sign
|
279
|
+
errorMsg="--- INVALID ENHANCE SPECIFICATION ---"
|
280
|
+
checkMinus "$1"
|
281
|
+
enhance="$1"
|
282
|
+
case "$1" in
|
283
|
+
none) ;;
|
284
|
+
stretch) ;;
|
285
|
+
normalize) ;;
|
286
|
+
*) errMsg "--- ENHANCE=$enhance IS NOT A VALID CHOICE ---" ;;
|
287
|
+
esac
|
288
|
+
;;
|
289
|
+
-f) # get filtersize
|
290
|
+
shift # to get the next parameter
|
291
|
+
# test if parameter starts with minus sign
|
292
|
+
errorMsg="--- INVALID FILTERSIZE SPECIFICATION ---"
|
293
|
+
checkMinus "$1"
|
294
|
+
filtersize=`expr "$1" : '\([0-9]*\)'`
|
295
|
+
[ "$filtersize" = "" ] && errMsg "--- FILTERSIZE=$filtersize MUST BE A NON-NEGATIVE INTEGER ---"
|
296
|
+
filtersizetest=`echo "$filtersize < 1" | bc`
|
297
|
+
[ $filtersizetest -eq 1 ] && errMsg "--- FILTERSIZE=$filtersize MUST BE AN INTEGER GREATER THAN 0 ---"
|
298
|
+
;;
|
299
|
+
-o) # get offset
|
300
|
+
shift # to get the next parameter
|
301
|
+
# test if parameter starts with minus sign
|
302
|
+
errorMsg="--- INVALID OFFSET SPECIFICATION ---"
|
303
|
+
checkMinus "$1"
|
304
|
+
offset=`expr "$1" : '\([0-9]*\)'`
|
305
|
+
[ "$offset" = "" ] && errMsg "--- OFFSET=$offset MUST BE A NON-NEGATIVE INTEGER ---"
|
306
|
+
;;
|
307
|
+
-t) # get threshold
|
308
|
+
shift # to get the next parameter
|
309
|
+
# test if parameter starts with minus sign
|
310
|
+
errorMsg="--- INVALID THRESHOLD SPECIFICATION ---"
|
311
|
+
checkMinus "$1"
|
312
|
+
threshold=`expr "$1" : '\([0-9]*\)'`
|
313
|
+
[ "$threshold" = "" ] && errMsg "--- THRESHOLD=$threshold MUST BE A NON-NEGATIVE INTEGER ---"
|
314
|
+
thresholdtestA=`echo "$threshold < 0" | bc`
|
315
|
+
thresholdtestB=`echo "$threshold > 100" | bc`
|
316
|
+
[ $thresholdtestA -eq 1 -o $thresholdtestB -eq 1 ] && errMsg "--- THRESHOLD=$threshold MUST BE AN INTEGER GREATER BETWEEN 0 AND 100 ---"
|
317
|
+
;;
|
318
|
+
-s) # get sharpamt
|
319
|
+
shift # to get the next parameter
|
320
|
+
# test if parameter starts with minus sign
|
321
|
+
errorMsg="--- INVALID SHARPAMT SPECIFICATION ---"
|
322
|
+
checkMinus "$1"
|
323
|
+
sharpamt=`expr "$1" : '\([.0-9]*\)'`
|
324
|
+
[ "$sharpamt" = "" ] && errMsg "--- SHARPAMT=$sharpamt MUST BE A NON-NEGATIVE FLOAT ---"
|
325
|
+
;;
|
326
|
+
-S) # get saturation
|
327
|
+
shift # to get the next parameter
|
328
|
+
# test if parameter starts with minus sign
|
329
|
+
errorMsg="--- INVALID SATURATION SPECIFICATION ---"
|
330
|
+
checkMinus "$1"
|
331
|
+
saturation=`expr "$1" : '\([0-9]*\)'`
|
332
|
+
[ "$saturation" = "" ] && errMsg "--- SATURATION=$saturation MUST BE A NON-NEGATIVE INTEGER ---"
|
333
|
+
;;
|
334
|
+
-a) # get adaptblur
|
335
|
+
shift # to get the next parameter
|
336
|
+
# test if parameter starts with minus sign
|
337
|
+
errorMsg="--- INVALID ADAPTBLUR SPECIFICATION ---"
|
338
|
+
checkMinus "$1"
|
339
|
+
adaptblur=`expr "$1" : '\([.0-9]*\)'`
|
340
|
+
[ "$adaptblur" = "" ] && errMsg "--- ADAPTBLUR=$adaptblur MUST BE A NON-NEGATIVE FLOAT ---"
|
341
|
+
;;
|
342
|
+
-u) # set unrotate
|
343
|
+
unrotate="yes"
|
344
|
+
;;
|
345
|
+
-T) # set trim
|
346
|
+
trim="yes"
|
347
|
+
;;
|
348
|
+
-p) # get padamt
|
349
|
+
shift # to get the next parameter
|
350
|
+
# test if parameter starts with minus sign
|
351
|
+
errorMsg="--- INVALID PADAMT SPECIFICATION ---"
|
352
|
+
checkMinus "$1"
|
353
|
+
padamt=`expr "$1" : '\([0-9]*\)'`
|
354
|
+
[ "$padamt" = "" ] && errMsg "--- PADAMT=$padamt MUST BE A NON-NEGATIVE INTEGER ---"
|
355
|
+
;;
|
356
|
+
-b) # get bgcolor
|
357
|
+
shift # to get the next parameter
|
358
|
+
# test if parameter starts with minus sign
|
359
|
+
errorMsg="--- INVALID BACKGROUND COLOR SPECIFICATION ---"
|
360
|
+
checkMinus "$1"
|
361
|
+
bgcolor="$1"
|
362
|
+
;;
|
363
|
+
-) # STDIN and end of arguments
|
364
|
+
break
|
365
|
+
;;
|
366
|
+
-*) # any other - argument
|
367
|
+
errMsg "--- UNKNOWN OPTION ---"
|
368
|
+
;;
|
369
|
+
*) # end of arguments
|
370
|
+
break
|
371
|
+
;;
|
372
|
+
esac
|
373
|
+
shift # next option
|
374
|
+
done
|
375
|
+
#
|
376
|
+
# get infile and outfile
|
377
|
+
infile="$1"
|
378
|
+
outfile="$2"
|
379
|
+
fi
|
380
|
+
|
381
|
+
# test that infile provided
|
382
|
+
[ "$infile" = "" ] && errMsg "NO INPUT FILE SPECIFIED"
|
383
|
+
|
384
|
+
# test that outfile provided
|
385
|
+
[ "$outfile" = "" ] && errMsg "NO OUTPUT FILE SPECIFIED"
|
386
|
+
|
387
|
+
# get im version
|
388
|
+
im_version=`convert -list configure | \
|
389
|
+
sed '/^LIB_VERSION_NUMBER /!d; s//,/; s/,/,0/g; s/,0*\([0-9][0-9]\)/\1/g' | head -n 1`
|
390
|
+
|
391
|
+
tmpA1="$dir/textcleaner_1_$$.mpc"
|
392
|
+
tmpA2="$dir/textcleaner_1_$$.cache"
|
393
|
+
trap "rm -f $tmpA1 $tmpA2 exit 0;" 0
|
394
|
+
trap "rm -f $tmpA1 $tmpA2; exit 1" 1 2 3 15
|
395
|
+
#trap "rm -f $tmpA1 $tmpA2; exit 1" ERR
|
396
|
+
|
397
|
+
|
398
|
+
# test for hdri enabled
|
399
|
+
# NOTE: must put grep before trap using ERR in case it does not find a match
|
400
|
+
if [ "$im_version" -ge "07000000" ]; then
|
401
|
+
hdri_on=`convert -version | grep "HDRI"`
|
402
|
+
else
|
403
|
+
hdri_on=`convert -list configure | grep "enable-hdri"`
|
404
|
+
fi
|
405
|
+
|
406
|
+
|
407
|
+
# colorspace RGB and sRGB swapped between 6.7.5.5 and 6.7.6.7
|
408
|
+
# though probably not resolved until the latter
|
409
|
+
# then -colorspace gray changed to linear between 6.7.6.7 and 6.7.8.2
|
410
|
+
# then -separate converted to linear gray channels between 6.7.6.7 and 6.7.8.2,
|
411
|
+
# though probably not resolved until the latter
|
412
|
+
# so -colorspace HSL/HSB -separate and -colorspace gray became linear
|
413
|
+
# but we need to use -set colorspace RGB before using them at appropriate times
|
414
|
+
# so that results stay as in original script
|
415
|
+
# The following was determined from various version tests using textcleaner
|
416
|
+
# with IM 6.7.4.10, 6.7.6.10, 6.7.9.0
|
417
|
+
if [ "$im_version" -lt "06070607" -o "$im_version" -gt "06070707" ]; then
|
418
|
+
setcspace="-set colorspace RGB"
|
419
|
+
else
|
420
|
+
setcspace=""
|
421
|
+
fi
|
422
|
+
# no need for setcspace for grayscale or channels after 6.8.5.4
|
423
|
+
if [ "$im_version" -gt "06080504" ]; then
|
424
|
+
setcspace=""
|
425
|
+
fi
|
426
|
+
|
427
|
+
|
428
|
+
# read the input image into the TMP cached image.
|
429
|
+
convert -quiet "$infile" +repage "$tmpA1" ||
|
430
|
+
errMsg "--- FILE $infile NOT READABLE OR HAS ZERO SIZE ---"
|
431
|
+
|
432
|
+
# get image size
|
433
|
+
ww=`convert $tmpA1 -ping -format "%w" info:`
|
434
|
+
hh=`convert $tmpA1 -ping -format "%h" info:`
|
435
|
+
|
436
|
+
# get image h/w aspect ratio and determine if portrait=1 (h/w>1) or landscape=0 (h/w<1)
|
437
|
+
aspect=`convert xc: -format "%[fx:($hh/$ww)>=1?1:0]" info:`
|
438
|
+
|
439
|
+
#echo "ww=$ww; hh=$hh; aspect=$aspect"
|
440
|
+
|
441
|
+
# set up rotation
|
442
|
+
if [ "$layout" = "portrait" -a $aspect -eq 0 -a "$rotate" = "cw" ]; then
|
443
|
+
rotation="-rotate 90"
|
444
|
+
elif [ "$layout" = "portrait" -a $aspect -eq 0 -a "$rotate" = "ccw" ]; then
|
445
|
+
rotation="-rotate -90"
|
446
|
+
elif [ "$layout" = "landscape" -a $aspect -eq 1 -a "$rotate" = "cw" ]; then
|
447
|
+
rotation="-rotate 90"
|
448
|
+
elif [ "$layout" = "landscape" -a $aspect -eq 1 -a "$rotate" = "ccw" ]; then
|
449
|
+
rotation="-rotate -90"
|
450
|
+
else
|
451
|
+
rotation=""
|
452
|
+
fi
|
453
|
+
|
454
|
+
# set up cropping
|
455
|
+
if [ "$cropoff" != "" -a $numcrops -eq 1 ]; then
|
456
|
+
wwc=`convert xc: -format "%[fx:$ww-2*$crop1]" info:`
|
457
|
+
hhc=`convert xc: -format "%[fx:$hh-2*$crop1]" info:`
|
458
|
+
cropping="-crop ${wwc}x${hhc}+$crop1+$crop1 +repage"
|
459
|
+
elif [ "$cropoff" != "" -a $numcrops -eq 2 ]; then
|
460
|
+
wwc=`convert xc: -format "%[fx:$ww-2*$crop1]" info:`
|
461
|
+
hhc=`convert xc: -format "%[fx:$hh-2*$crop2]" info:`
|
462
|
+
cropping="-crop ${wwc}x${hhc}+$crop1+$crop2 +repage"
|
463
|
+
elif [ "$cropoff" != "" -a $numcrops -eq 4 ]; then
|
464
|
+
wwc=`convert xc: -format "%[fx:$ww-($crop1+$crop3)]" info:`
|
465
|
+
hhc=`convert xc: -format "%[fx:$hh-($crop2+$crop4)]" info:`
|
466
|
+
cropping="-crop ${wwc}x${hhc}+$crop1+$crop2 +repage"
|
467
|
+
else
|
468
|
+
cropping=""
|
469
|
+
fi
|
470
|
+
#echo "cropoff=$cropoff; numcrops=$numcrops; cropping=$cropping"
|
471
|
+
|
472
|
+
# test if grayscale
|
473
|
+
grayscale=`convert $tmpA1 -format "%[colorspace]" info:`
|
474
|
+
typegray=`convert $tmpA1 -format '%r' info: | grep 'Gray'`
|
475
|
+
if [ "$gray" = "yes" -o "$grayscale" = "Gray" -o "$typegray" != "" ]; then
|
476
|
+
makegray="$setcspace -colorspace gray -type grayscale"
|
477
|
+
else
|
478
|
+
makegray=""
|
479
|
+
fi
|
480
|
+
#echo "makegray=$makegray"
|
481
|
+
|
482
|
+
# set up enhance
|
483
|
+
if [ "$enhance" = "stretch" ]; then
|
484
|
+
enhancing="$setcspace -contrast-stretch 0"
|
485
|
+
elif [ "$enhance" = "normalize" ]; then
|
486
|
+
enhancing="$setcspace -normalize"
|
487
|
+
else
|
488
|
+
enhancing=""
|
489
|
+
fi
|
490
|
+
#echo "enhancing=$enhancing"
|
491
|
+
|
492
|
+
# setup blurring
|
493
|
+
if [ "$threshold" = "" ]; then
|
494
|
+
blurring=""
|
495
|
+
else
|
496
|
+
# note: any 0<bluramt<=1, will be the same as using bluramt=1, since radius must be used as an integer
|
497
|
+
# bluramt=`convert xc: -format "%[fx:$threshold/100]" info:`
|
498
|
+
# blurring="-blur ${bluramt}x65535 -level ${threshold}x100%"
|
499
|
+
blurring="-blur 1x65535 -level ${threshold}x100%"
|
500
|
+
fi
|
501
|
+
#echo "blurring=$blurring"
|
502
|
+
|
503
|
+
# set up unrotate
|
504
|
+
if [ "$unrotate" = "yes" ]; then
|
505
|
+
unrotating="-background $bgcolor -deskew 40%"
|
506
|
+
else
|
507
|
+
unrotating=""
|
508
|
+
fi
|
509
|
+
#echo "unrotating=$unrotating"
|
510
|
+
|
511
|
+
# setup sharpening
|
512
|
+
if [ "$sharpamt" = "0" -o "$sharpamt" = "0.0" ]; then
|
513
|
+
sharpening=""
|
514
|
+
else
|
515
|
+
sharpening="-sharpen 0x${sharpamt}"
|
516
|
+
fi
|
517
|
+
#echo "sharpening=$sharpening"
|
518
|
+
|
519
|
+
# setup modulation
|
520
|
+
[ "$gray" = "yes" -o "$grayscale" = "Gray" -o "$typegray" != "" ] && saturation=100
|
521
|
+
if [ $saturation -eq 100 ]; then
|
522
|
+
modulation=""
|
523
|
+
else
|
524
|
+
modulation="-modulate 100,$saturation,100"
|
525
|
+
fi
|
526
|
+
#echo "modulation=$modulation"
|
527
|
+
|
528
|
+
# set up adaptiveblurring
|
529
|
+
if [ "$adaptblur" = "0" ]; then
|
530
|
+
adaptiveblurring=""
|
531
|
+
else
|
532
|
+
adaptiveblurring="-adaptive-blur $adaptblur"
|
533
|
+
fi
|
534
|
+
|
535
|
+
# set up trim
|
536
|
+
if [ "$trim" = "yes" -a "$hdri_on" != "" ]; then
|
537
|
+
# hdri is enabled
|
538
|
+
# need to round near white to pure white for trim to work
|
539
|
+
trimming="-white-threshold 99.9% -trim +repage "
|
540
|
+
elif [ "$trim" = "yes" -a "$hdri_on" = "" ]; then
|
541
|
+
# hdri is not enabled
|
542
|
+
trimming="-trim +repage "
|
543
|
+
else
|
544
|
+
trimming=""
|
545
|
+
fi
|
546
|
+
#echo "trimming=$trimming"
|
547
|
+
|
548
|
+
# set up pad
|
549
|
+
if [ $padamt -gt 0 ]; then
|
550
|
+
# note must reset -compose from -compose copy_opacity as -border uses -compose
|
551
|
+
padding="-compose over -bordercolor $bgcolor -border $padamt"
|
552
|
+
else
|
553
|
+
padding=""
|
554
|
+
fi
|
555
|
+
#echo "padding=$padding"
|
556
|
+
|
557
|
+
|
558
|
+
# process image
|
559
|
+
convert -respect-parenthesis \( $tmpA1 $rotation $cropping $makegray $enhancing \) \
|
560
|
+
\( -clone 0 $setcspace -colorspace gray -negate -lat ${filtersize}x${filtersize}+${offset}% -contrast-stretch 0 $blurring \) \
|
561
|
+
-compose copy_opacity -composite -fill "$bgcolor" -opaque none -alpha off \
|
562
|
+
$unrotating $sharpening $modulation $adaptiveblurring $trimming $padding \
|
563
|
+
"$outfile"
|
564
|
+
exit 0
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Swaathi Kakarla
|
@@ -22,6 +22,7 @@ files:
|
|
22
22
|
- lib/simple-ocr/path.rb
|
23
23
|
- lib/simple-ocr/scan.rb
|
24
24
|
- lib/simple-ocr/zonal_ocr.rb
|
25
|
+
- lib/textcleaner
|
25
26
|
homepage: http://www.skcript.com
|
26
27
|
licenses:
|
27
28
|
- Closed
|
@@ -42,7 +43,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
42
43
|
version: '0'
|
43
44
|
requirements: []
|
44
45
|
rubyforge_project:
|
45
|
-
rubygems_version: 2.4.
|
46
|
+
rubygems_version: 2.4.5
|
46
47
|
signing_key:
|
47
48
|
specification_version: 4
|
48
49
|
summary: OCR Engine by Skcript
|