tesseract_ffi 0.2.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/Rakefile +6 -0
- data/lib/tesseract_ffi.rb +3 -2
- data/lib/tesseract_ffi/rectangles.rb +58 -0
- data/lib/tesseract_ffi/tesseract.rb +13 -30
- data/lib/tesseract_ffi/version.rb +1 -1
- data/tesseract_ffi.gemspec +39 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f462f5b13af429c30a228b4b3aeedb26af935c7caa7cff1752c7647a03a78d19
|
4
|
+
data.tar.gz: 14bb8f67addf6c8e3b6961d09e5ae0992830ddf32be048e1eae92fcefc958f57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31feb442ee485aadd9f5df19029aba9d6991702cb4635f9842826661ee3d001c72c5cd633e2c8172018318921e95157c17adb4fe9d9ce8a88e7a13a9cb1c9add
|
7
|
+
data.tar.gz: 2e4371431e1b7d3f19dd5f40fe4e3d9237f7b1ce056b066a9d16b89070bfdbaf1cf80b5a8ec41003c4fb19ed065aedb956baa98ddfdbbcb1f6d33519fe53d1f8
|
data/Gemfile.lock
CHANGED
@@ -4,7 +4,7 @@ GEM
|
|
4
4
|
awesome_print (1.8.0)
|
5
5
|
docile (1.3.2)
|
6
6
|
ffi (1.13.1)
|
7
|
-
hocr_reader (0.
|
7
|
+
hocr_reader (0.2.0)
|
8
8
|
nokogiri (~> 1.10.10)
|
9
9
|
mini_portile2 (2.4.0)
|
10
10
|
minitest (5.14.1)
|
@@ -13,7 +13,7 @@ GEM
|
|
13
13
|
mini_portile2 (~> 2.4.0)
|
14
14
|
rake (13.0.1)
|
15
15
|
rdoc (6.2.1)
|
16
|
-
simplecov (0.
|
16
|
+
simplecov (0.19.0)
|
17
17
|
docile (~> 1.1)
|
18
18
|
simplecov-html (~> 0.11)
|
19
19
|
simplecov-html (0.12.2)
|
data/Rakefile
CHANGED
@@ -19,6 +19,12 @@ Rake::TestTask.new(:test_units) do |test|
|
|
19
19
|
test.verbose = true
|
20
20
|
end
|
21
21
|
|
22
|
+
Rake::TestTask.new(:bench) do |test|
|
23
|
+
test.libs << 'lib' << 'test'
|
24
|
+
test.pattern = 'test/performance/*_bench.rb'
|
25
|
+
test.verbose = true
|
26
|
+
end
|
27
|
+
|
22
28
|
task :default => :test
|
23
29
|
|
24
30
|
|
data/lib/tesseract_ffi.rb
CHANGED
@@ -2,8 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'ffi'
|
4
4
|
require 'tesseract_ffi/version'
|
5
|
-
require 'tesseract_ffi/conf_vars'
|
6
|
-
require 'tesseract_ffi/oem'
|
5
|
+
require 'tesseract_ffi/conf_vars' # mix-in to tesseract
|
6
|
+
require 'tesseract_ffi/oem' # mix-in to tesseract
|
7
|
+
require 'tesseract_ffi/rectangles' # mix-in to tesseract
|
7
8
|
require 'tesseract_ffi/tesseract'
|
8
9
|
require 'tesseract_ffi/tess_exception'
|
9
10
|
require 'tesseract_ffi/quick'
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TesseractFFI
|
4
|
+
# module Rectangles mixin for recognizing text blocks defined by rectangles
|
5
|
+
module Rectangles
|
6
|
+
def set_rectangle(x_coord, y_coord, width, height)
|
7
|
+
tess_set_rectangle(@handle, x_coord, y_coord, width, height)
|
8
|
+
end
|
9
|
+
|
10
|
+
def recognize_rectangle(x_coord, y_coord, width, height)
|
11
|
+
setup do
|
12
|
+
set_rectangle(x_coord, y_coord, width, height)
|
13
|
+
ocr
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# rubocop:disable Metrics/MethodLength
|
18
|
+
def recognize_rectangles(rectangle_list)
|
19
|
+
@texts = []
|
20
|
+
@rectangle_list = nil
|
21
|
+
if valid_rectangle_list? rectangle_list
|
22
|
+
@rectangle_list = rectangle_list
|
23
|
+
setup do
|
24
|
+
@rectangle_list.each do |r|
|
25
|
+
set_rectangle(r[0], r[1], r[2], r[3])
|
26
|
+
ocr
|
27
|
+
@texts << @utf8_text.strip
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
@texts
|
32
|
+
end
|
33
|
+
|
34
|
+
def valid_rectangle_list?(list)
|
35
|
+
if list.is_a?(Array) && list.all? { |r| valid_rectangle?(r) }
|
36
|
+
true
|
37
|
+
else
|
38
|
+
msg = 'Tess Error Argument must be a list'
|
39
|
+
# copy the error message as we are not going to Setup
|
40
|
+
@errors << msg
|
41
|
+
raise TessException.new(error_msg: msg)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def valid_rectangle?(rectangle)
|
46
|
+
if rectangle.is_a?(Array) &&
|
47
|
+
rectangle.length == 4 &&
|
48
|
+
rectangle.all? { |r| r.is_a?(Integer) }
|
49
|
+
true
|
50
|
+
else
|
51
|
+
msg = 'Tesseract Error Argument must be array of 4 Integer'
|
52
|
+
@errors << msg
|
53
|
+
raise TessException.new(error_msg: msg)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
# rubocop:enable Metrics/MethodLength
|
57
|
+
end
|
58
|
+
end
|
@@ -6,22 +6,29 @@ module TesseractFFI
|
|
6
6
|
include TesseractFFI
|
7
7
|
include ConfVars
|
8
8
|
include OEM
|
9
|
+
include Rectangles
|
9
10
|
|
10
11
|
attr_accessor :language, :file_name, :source_resolution
|
11
12
|
attr_reader :utf8_text, :hocr_text, :errors
|
12
13
|
|
13
14
|
def initialize(file_name: nil, language: 'eng', source_resolution: 72, oem: DEFAULT)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
unless file_name.is_a?(String) && File.exist?(file_name)
|
16
|
+
log 'Error: Tesseract needs a file ' + (file_name || 'no name given')
|
17
|
+
raise TessException.new(error_msg: 'file_name must be provided')
|
18
|
+
end
|
18
19
|
|
19
20
|
@file_name = file_name
|
21
|
+
@language = language
|
20
22
|
@source_resolution = source_resolution
|
21
23
|
@oem = oem
|
22
24
|
@errors = []
|
23
25
|
end
|
24
26
|
|
27
|
+
# just output to console
|
28
|
+
def log(msg)
|
29
|
+
puts msg
|
30
|
+
end
|
31
|
+
|
25
32
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
26
33
|
def setup
|
27
34
|
@handle = tess_create
|
@@ -37,6 +44,7 @@ module TesseractFFI
|
|
37
44
|
yield # run the block for recognition etc
|
38
45
|
rescue TessException => e
|
39
46
|
@errors << "Tesseract Error #{e.error[:error_msg]}"
|
47
|
+
log @errors
|
40
48
|
raise
|
41
49
|
ensure
|
42
50
|
tess_end(@handle)
|
@@ -48,9 +56,7 @@ module TesseractFFI
|
|
48
56
|
tess_set_source_resolution(@handle, @source_resolution)
|
49
57
|
raise TessException.new(error_msg: 'Recognition Error') if tess_recognize(@handle, 0) != 0
|
50
58
|
|
51
|
-
@utf8_text =
|
52
|
-
text = tess_get_utf8(@handle, 0)
|
53
|
-
@utf8_text = text.encode('UTF-8') if text
|
59
|
+
@utf8_text = tess_get_utf8(@handle, 0)
|
54
60
|
@hocr_text = tess_get_hocr(@handle, 0)
|
55
61
|
end
|
56
62
|
|
@@ -67,28 +73,5 @@ module TesseractFFI
|
|
67
73
|
TesseractFFI.tess_process_pages(@handle, @file_name, nil, 5000, pdf_renderer)
|
68
74
|
end
|
69
75
|
end
|
70
|
-
|
71
|
-
def set_rectangle(x_coord, y_coord, width, height)
|
72
|
-
tess_set_rectangle(@handle, x_coord, y_coord, width, height)
|
73
|
-
end
|
74
|
-
|
75
|
-
def recognize_rectangle(x_coord, y_coord, width, height)
|
76
|
-
setup do
|
77
|
-
set_rectangle(x_coord, y_coord, width, height)
|
78
|
-
ocr
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
def recognize_rectangles(rectangle_list)
|
83
|
-
texts = []
|
84
|
-
setup do
|
85
|
-
rectangle_list.each do |r|
|
86
|
-
set_rectangle(r[0], r[1], r[2], r[3])
|
87
|
-
ocr
|
88
|
-
texts << @utf8_text.strip
|
89
|
-
end
|
90
|
-
end
|
91
|
-
texts
|
92
|
-
end
|
93
76
|
end
|
94
77
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/tesseract_ffi/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = "tesseract_ffi"
|
7
|
+
spec.version = TesseractFFI::VERSION
|
8
|
+
spec.authors = ["David Verrier"]
|
9
|
+
spec.email = ["dverrier@gmail.com"]
|
10
|
+
|
11
|
+
spec.summary = %q{This is a Ruby-wrapper around the Tesseract C-API.}
|
12
|
+
spec.description = %q{This wrapper around the C-API allows use of the legacy modes of the recognition engine.}
|
13
|
+
spec.homepage = "https://github.com/dverrier/tesseract_ffi"
|
14
|
+
spec.license = "MIT"
|
15
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
16
|
+
|
17
|
+
spec.metadata["allowed_push_host"] = "https://rubygems.org"
|
18
|
+
|
19
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
20
|
+
spec.metadata["source_code_uri"] = "https://github.com/dverrier/tesseract_ffi"
|
21
|
+
spec.metadata["changelog_uri"] = "https://github.com/dverrier/tesseract_ffi"
|
22
|
+
|
23
|
+
# Specify which files should be added to the gem when it is released.
|
24
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
25
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
26
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
27
|
+
end
|
28
|
+
spec.add_dependency "ffi"
|
29
|
+
|
30
|
+
spec.add_development_dependency "minitest", "~> 5.14.1"
|
31
|
+
spec.add_development_dependency "mocha", "~> 1.11.2"
|
32
|
+
spec.add_development_dependency "simplecov", "~> 0.18.5"
|
33
|
+
spec.add_development_dependency "awesome_print", "> 1.8.0"
|
34
|
+
spec.add_development_dependency "nokogiri","> 1.10.10"
|
35
|
+
|
36
|
+
spec.bindir = "bin"
|
37
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
38
|
+
spec.require_paths = ["lib"]
|
39
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract_ffi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Verrier
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08
|
11
|
+
date: 2020-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -121,9 +121,11 @@ files:
|
|
121
121
|
- lib/tesseract_ffi/conf_vars.rb
|
122
122
|
- lib/tesseract_ffi/oem.rb
|
123
123
|
- lib/tesseract_ffi/quick.rb
|
124
|
+
- lib/tesseract_ffi/rectangles.rb
|
124
125
|
- lib/tesseract_ffi/tess_exception.rb
|
125
126
|
- lib/tesseract_ffi/tesseract.rb
|
126
127
|
- lib/tesseract_ffi/version.rb
|
128
|
+
- tesseract_ffi.gemspec
|
127
129
|
- tmp/keep.txt
|
128
130
|
homepage: https://github.com/dverrier/tesseract_ffi
|
129
131
|
licenses:
|