simple-ocr 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/simple-ocr.rb +8 -0
- data/lib/simple-ocr/image.rb +20 -0
- data/lib/simple-ocr/path.rb +52 -0
- data/lib/simple-ocr/scan.rb +54 -0
- data/lib/simple-ocr/zonal_ocr.rb +5 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1f3b37c06436ef5f307e64c4033e5116b531f55c
|
4
|
+
data.tar.gz: 55a206753bfaec92711f8e42f2e5516dcde68ed6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8336d1620e4982e00961862ce62f11db12344cb671712988c8258a34ddd2a64b021e36116fe541d9d81c26211e154cbff41e30c7d2f2f07d168a8f4b43ea3f54
|
7
|
+
data.tar.gz: 23957867d8bba43086ee2cb9f76a331f9a9da4b68c7811cbc37bd227ca78a2b6e9acce9dab58404242b4e9da41df6eb4e3aa9813912d3d743a6c5b4623079039
|
data/lib/simple-ocr/image.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module OCR
  # Entry point of the gem: wraps one input file and runs OCR on it.
  class Image
    # Remembers the input file for a later #scan call.
    #
    # @param path [String] path to the input file.
    def initialize(path)
      @image = path
    end

    # Runs OCR on the input file (main function).
    #
    # Builds an OCR::Scan for the stored input and returns whatever
    # OCR::Scan#scan_img returns.
    #
    # @param output_file [String] path to the output file.
    # @param options [String, nil] conversion options (e.g. language);
    #   may be omitted.
    # @param type [String, nil] output format of the file; may be omitted.
    def scan(output_file, options = nil, type = nil)
      Scan.new(@image, output_file, options, type).scan_img
    end
  end
end
|
data/lib/simple-ocr/path.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
module OCR
  # Derives the file names used during an OCR run from one input path.
  class Path
    EXTENS = {:pdf => "pdf", :png => "png"}.freeze

    # Remembers the input file path.
    #
    # @param input_file [String] path to the input file.
    def initialize(input_file)
      @input_file = input_file
    end

    # Splits the file name into its stem and its extension.
    #
    # Only the last dot separates the extension, so "my.doc.pdf" yields
    # ["my.doc", "pdf"]. A name without an extension yields a one-element
    # array, so index 1 is nil.
    #
    # @return [Array<String>] name and extension (extension without the dot).
    def name_exten
      filename = File.basename(@input_file)
      ext = File.extname(filename)
      stem = File.basename(filename, ext)
      ext.empty? ? [stem] : [stem, ext[1..-1]]
    end

    # Duplicates the input file path.
    #
    # @return [String] a copy of the input file path.
    def duplicate_path
      @input_file.dup
    end

    # Path of the PNG produced when the input is converted to an image.
    #
    # Only the trailing extension is swapped; directory names that happen to
    # contain the extension text are left alone. When the input has no
    # extension, ".png" is appended.
    #
    # @return [String] converted image path.
    def image_path
      ext = File.extname(@input_file)
      base = ext.empty? ? duplicate_path : @input_file[0...-ext.length]
      "#{base}.#{Path::EXTENS[:png]}"
    end

    # Path of the cleaned-up image, placed next to the input file.
    #
    # @return [String] cleaned image path ("cleaned_<name>.png").
    def clean_image_path
      filename = get_filename
      cleaned = "cleaned_#{name_exten[0]}.#{Path::EXTENS[:png]}"
      # Replace the LAST occurrence of the file name: an earlier occurrence
      # would be part of a directory name, not the file itself.
      index = @input_file.rindex(filename)
      return File.join(File.dirname(@input_file), cleaned) if index.nil?

      duppath = duplicate_path
      duppath[index, filename.length] = cleaned
      duppath
    end

    # File name of the input, without directory components.
    #
    # @return [String] filename.
    def get_filename
      File.basename(@input_file)
    end
  end
end
|
data/lib/simple-ocr/scan.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'fileutils'
require 'open3'
require 'shellwords'
|
2
|
+
|
3
|
+
module OCR
  # Runs one OCR job: optional PDF-to-image conversion, image clean-up,
  # the tesseract call, and removal of the intermediate files.
  class Scan
    EXTENS = %w{pdf}

    # Stores the job parameters and works out the intermediate file paths.
    # A PDF input is converted to an image immediately.
    #
    # @param input_file [String] path to the input file.
    # @param output_file [String] path to the output file.
    # @param options [String] conversion options (e.g. language); passed to
    #   the shell verbatim, so it must come from a trusted source.
    # @param type [String] output format of the file; passed to the shell
    #   verbatim, so it must come from a trusted source.
    def initialize(input_file, output_file, options, type)
      @output_file = output_file
      @options = options
      @type = type
      @input_file = input_file
      path = OCR::Path.new(input_file)
      if pdf_input?
        @image = path.image_path
        convert_to_img
      else
        @image = input_file
      end
      @clean_image = path.clean_image_path
    end

    # Converts the PDF input to a PNG image with Ghostscript.
    #
    # NOTE(review): OCR::MIN_DENSITY is not defined in this file; presumably
    # it is declared elsewhere in the gem - confirm.
    #
    # @return [String] standard output of the gs command.
    def convert_to_img
      # Paths are shell-escaped so that quotes or spaces in a file name
      # cannot break or alter the command.
      `gs -sDEVICE=png16m '-r#{OCR::MIN_DENSITY}' -o #{Shellwords.escape(@image)} #{Shellwords.escape(@input_file)}`
    end

    # Cleans the image, runs tesseract on the cleaned copy and removes the
    # intermediate files afterwards.
    #
    # @return the result of #delete_files.
    def scan_img
      begin
        clean_img
        `tesseract #{Shellwords.escape(@clean_image)} #{@options} #{Shellwords.escape(@output_file)} #{@type}`
      rescue StandardError
        # Do not leave intermediate files behind when a command cannot run.
        delete_files
        raise
      end
      delete_files
    end

    # Starts a command through Open3.popen3 without a block.
    #
    # @param command [String] command line to run.
    # @return [Array] whatever Open3.popen3 returns; when called without a
    #   block the caller is responsible for closing the returned streams.
    def exec_command(command)
      Open3.popen3(command)
    end

    # Runs the textcleaner shell script to produce the cleaned image.
    #
    # NOTE(review): "./textcleaner" is a relative path, so it is looked up
    # from the current working directory of the process - confirm that
    # callers run from the directory that contains the script.
    #
    # @return [String] standard output of the script.
    def clean_img
      `sh ./textcleaner -g -e stretch -f 25 -o 20 -t 30 -u -s 1 -T -p 20 #{Shellwords.escape(@image)} #{Shellwords.escape(@clean_image)}`
    end

    # Deletes the intermediate files: always the cleaned image, and the
    # converted image only when the input was a PDF (otherwise @image is the
    # caller's own input file).
    def delete_files
      FileUtils.rm_rf(@clean_image)
      FileUtils.rm_rf(@image) if pdf_input?
    end

    private

    # True when the input file's extension is "pdf" (case-insensitive).
    def pdf_input?
      extension = OCR::Path.new(@input_file).name_exten[1]
      extension.to_s.downcase == OCR::Path::EXTENS[:pdf]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simple-ocr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Swaathi Kakarla
|
8
|
+
- Shilpi Agrawal
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-10-28 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Provides smart conversion of all scanned Images.
|
15
|
+
email: shilpi@skcript.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/simple-ocr.rb
|
21
|
+
- lib/simple-ocr/image.rb
|
22
|
+
- lib/simple-ocr/path.rb
|
23
|
+
- lib/simple-ocr/scan.rb
|
24
|
+
- lib/simple-ocr/zonal_ocr.rb
|
25
|
+
homepage: http://www.skcript.com
|
26
|
+
licenses:
|
27
|
+
- Closed
|
28
|
+
metadata: {}
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
requirements: []
|
44
|
+
rubyforge_project:
|
45
|
+
rubygems_version: 2.4.5
|
46
|
+
signing_key:
|
47
|
+
specification_version: 4
|
48
|
+
summary: OCR Engine by Skcript
|
49
|
+
test_files: []
|
50
|
+
has_rdoc:
|