tesseract_ffi 0.2.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b299b8e91905a54a81999730de02ae2dbf5ed58cf2bbf826854c178fd5f33296
4
- data.tar.gz: 8906f6a6dd21011c076c8d785f6ac99e2297112831f829a924a32f3763b09713
3
+ metadata.gz: f462f5b13af429c30a228b4b3aeedb26af935c7caa7cff1752c7647a03a78d19
4
+ data.tar.gz: 14bb8f67addf6c8e3b6961d09e5ae0992830ddf32be048e1eae92fcefc958f57
5
5
  SHA512:
6
- metadata.gz: 9d11f40349117f99080ba4d3eb7f75c4c3d44941c6a3e4aaeec71f22b0ac8e6bc0a361e89c6c657a3fd864853300112883c42162ee5af89f20b841a3cb91f544
7
- data.tar.gz: cb2e721f373141bd6f42ef97ebd39257a6735c1b34e241739cdc4cfcdf5f0d7c6747ad95df3b6aa13b12e8a49115081dda447c708c5a6b64718d9dc57c739d2c
6
+ metadata.gz: 31feb442ee485aadd9f5df19029aba9d6991702cb4635f9842826661ee3d001c72c5cd633e2c8172018318921e95157c17adb4fe9d9ce8a88e7a13a9cb1c9add
7
+ data.tar.gz: 2e4371431e1b7d3f19dd5f40fe4e3d9237f7b1ce056b066a9d16b89070bfdbaf1cf80b5a8ec41003c4fb19ed065aedb956baa98ddfdbbcb1f6d33519fe53d1f8
@@ -4,7 +4,7 @@ GEM
4
4
  awesome_print (1.8.0)
5
5
  docile (1.3.2)
6
6
  ffi (1.13.1)
7
- hocr_reader (0.1.0)
7
+ hocr_reader (0.2.0)
8
8
  nokogiri (~> 1.10.10)
9
9
  mini_portile2 (2.4.0)
10
10
  minitest (5.14.1)
@@ -13,7 +13,7 @@ GEM
13
13
  mini_portile2 (~> 2.4.0)
14
14
  rake (13.0.1)
15
15
  rdoc (6.2.1)
16
- simplecov (0.18.5)
16
+ simplecov (0.19.0)
17
17
  docile (~> 1.1)
18
18
  simplecov-html (~> 0.11)
19
19
  simplecov-html (0.12.2)
data/Rakefile CHANGED
@@ -19,6 +19,12 @@ Rake::TestTask.new(:test_units) do |test|
19
19
  test.verbose = true
20
20
  end
21
21
 
22
# Benchmark task: runs the files matching test/performance/*_bench.rb
# with lib/ and test/ on the load path.
Rake::TestTask.new(:bench) do |bench|
  bench.libs.push('lib', 'test')
  bench.pattern = 'test/performance/*_bench.rb'
  bench.verbose = true
end
27
+
22
28
  task :default => :test
23
29
 
24
30
 
@@ -2,8 +2,9 @@
2
2
 
3
3
  require 'ffi'
4
4
  require 'tesseract_ffi/version'
5
- require 'tesseract_ffi/conf_vars' # mix-in to tesseract
6
- require 'tesseract_ffi/oem' # mix-in to tesseract
5
+ require 'tesseract_ffi/conf_vars' # mix-in to tesseract
6
+ require 'tesseract_ffi/oem' # mix-in to tesseract
7
+ require 'tesseract_ffi/rectangles' # mix-in to tesseract
7
8
  require 'tesseract_ffi/tesseract'
8
9
  require 'tesseract_ffi/tess_exception'
9
10
  require 'tesseract_ffi/quick'
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module TesseractFFI
  # Mixin for recognizing text blocks defined by rectangles.
  #
  # The including class must provide +setup+ (which takes a block), +ocr+,
  # +tess_set_rectangle+, the +@handle+ and +@utf8_text+ instance variables
  # and an +@errors+ array; none of these are defined in this module.
  module Rectangles
    # Limits recognition to one rectangle by forwarding the four values,
    # together with +@handle+, to +tess_set_rectangle+.
    def set_rectangle(x_coord, y_coord, width, height)
      tess_set_rectangle(@handle, x_coord, y_coord, width, height)
    end

    # Sets a single rectangle and runs +ocr+ inside a +setup+ block.
    def recognize_rectangle(x_coord, y_coord, width, height)
      setup do
        set_rectangle(x_coord, y_coord, width, height)
        ocr
      end
    end

    # Recognizes every rectangle in turn and collects the stripped text.
    #
    # rectangle_list - Array of [x, y, width, height] arrays of Integers.
    #
    # Returns an Array with one String per rectangle, in input order
    # (also kept in @texts).
    # Raises TessException when rectangle_list fails validation.
    def recognize_rectangles(rectangle_list)
      @texts = []
      @rectangle_list = nil
      return @texts unless valid_rectangle_list?(rectangle_list)

      @rectangle_list = rectangle_list
      setup do
        @rectangle_list.each do |rectangle|
          set_rectangle(*rectangle)
          ocr
          # @utf8_text can be nil after ocr; coerce before stripping so one
          # empty region does not abort the batch with NoMethodError.
          @texts << @utf8_text.to_s.strip
        end
      end
      @texts
    end

    # Returns true when list is an Array whose elements all pass
    # valid_rectangle?. Otherwise records the message in @errors and raises
    # TessException (an invalid element raises from valid_rectangle? first).
    def valid_rectangle_list?(list)
      return true if list.is_a?(Array) && list.all? { |r| valid_rectangle?(r) }

      reject_rectangle_argument('Tess Error Argument must be a list')
    end

    # Returns true when rectangle is an Array of exactly four Integers.
    # Otherwise records the message in @errors and raises TessException.
    def valid_rectangle?(rectangle)
      well_formed = rectangle.is_a?(Array) &&
                    rectangle.length == 4 &&
                    rectangle.all? { |r| r.is_a?(Integer) }
      return true if well_formed

      reject_rectangle_argument('Tesseract Error Argument must be array of 4 Integer')
    end

    private

    # Copies msg into @errors (validation happens outside setup, so nothing
    # else records it) and raises TessException carrying the same message.
    def reject_rectangle_argument(msg)
      @errors << msg
      raise TessException.new(error_msg: msg)
    end
  end
end
@@ -6,22 +6,29 @@ module TesseractFFI
6
6
  include TesseractFFI
7
7
  include ConfVars
8
8
  include OEM
9
+ include Rectangles
9
10
 
10
11
  attr_accessor :language, :file_name, :source_resolution
11
12
  attr_reader :utf8_text, :hocr_text, :errors
12
13
 
13
14
  def initialize(file_name: nil, language: 'eng', source_resolution: 72, oem: DEFAULT)
14
- @language = language
15
- raise TessException.new(error_msg: 'file_name must be provided') unless file_name
16
-
17
- raise TessException.new(error_msg: "File #{file_name} not found") unless File.exist? file_name
15
+ unless file_name.is_a?(String) && File.exist?(file_name)
16
+ log 'Error: Tesseract needs a file ' + (file_name || 'no name given')
17
+ raise TessException.new(error_msg: 'file_name must be provided')
18
+ end
18
19
 
19
20
  @file_name = file_name
21
+ @language = language
20
22
  @source_resolution = source_resolution
21
23
  @oem = oem
22
24
  @errors = []
23
25
  end
24
26
 
27
# Prints msg to the console (standard output); nothing else is done with it.
def log(msg)
  $stdout.puts(msg)
end
31
+
25
32
  # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
26
33
  def setup
27
34
  @handle = tess_create
@@ -37,6 +44,7 @@ module TesseractFFI
37
44
  yield # run the block for recognition etc
38
45
  rescue TessException => e
39
46
  @errors << "Tesseract Error #{e.error[:error_msg]}"
47
+ log @errors
40
48
  raise
41
49
  ensure
42
50
  tess_end(@handle)
@@ -48,9 +56,7 @@ module TesseractFFI
48
56
  tess_set_source_resolution(@handle, @source_resolution)
49
57
  raise TessException.new(error_msg: 'Recognition Error') if tess_recognize(@handle, 0) != 0
50
58
 
51
- @utf8_text = ''
52
- text = tess_get_utf8(@handle, 0)
53
- @utf8_text = text.encode('UTF-8') if text
59
+ @utf8_text = tess_get_utf8(@handle, 0)
54
60
  @hocr_text = tess_get_hocr(@handle, 0)
55
61
  end
56
62
 
@@ -67,28 +73,5 @@ module TesseractFFI
67
73
  TesseractFFI.tess_process_pages(@handle, @file_name, nil, 5000, pdf_renderer)
68
74
  end
69
75
  end
70
-
71
- def set_rectangle(x_coord, y_coord, width, height)
72
- tess_set_rectangle(@handle, x_coord, y_coord, width, height)
73
- end
74
-
75
- def recognize_rectangle(x_coord, y_coord, width, height)
76
- setup do
77
- set_rectangle(x_coord, y_coord, width, height)
78
- ocr
79
- end
80
- end
81
-
82
- def recognize_rectangles(rectangle_list)
83
- texts = []
84
- setup do
85
- rectangle_list.each do |r|
86
- set_rectangle(r[0], r[1], r[2], r[3])
87
- ocr
88
- texts << @utf8_text.strip
89
- end
90
- end
91
- texts
92
- end
93
76
  end
94
77
  end
@@ -2,5 +2,5 @@
2
2
 
3
3
  # module with version
4
4
  module TesseractFFI
5
- VERSION = '0.2.0'
5
+ VERSION = '0.7.0'
6
6
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/tesseract_ffi/version'
4
+
5
# Gem specification for tesseract_ffi; the version comes from
# TesseractFFI::VERSION.
Gem::Specification.new do |spec|
  spec.name = 'tesseract_ffi'
  spec.version = TesseractFFI::VERSION
  spec.authors = ['David Verrier']
  spec.email = ['dverrier@gmail.com']

  spec.summary = 'This is a Ruby-wrapper around the Tesseract C-API.'
  spec.description = 'This wrapper around the C-API allows use of the legacy modes of the recognition engine.'
  spec.homepage = 'https://github.com/dverrier/tesseract_ffi'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')

  spec.metadata['allowed_push_host'] = 'https://rubygems.org'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/dverrier/tesseract_ffi'
  spec.metadata['changelog_uri'] = 'https://github.com/dverrier/tesseract_ffi'

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; anything under test/,
  # spec/ or features/ is left out of the packaged gem.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.add_dependency 'ffi'

  # Lower bounds are inclusive and simplecov is not capped at 0.18.x, so the
  # versions recorded in the bundled lockfile (awesome_print 1.8.0,
  # simplecov 0.19.0, nokogiri in the 1.10.10 series) satisfy these
  # requirements; the previous strict '>' / '~> 0.18.5' forms excluded them.
  spec.add_development_dependency 'minitest', '~> 5.14.1'
  spec.add_development_dependency 'mocha', '~> 1.11.2'
  spec.add_development_dependency 'simplecov', '~> 0.18', '>= 0.18.5'
  spec.add_development_dependency 'awesome_print', '>= 1.8.0'
  spec.add_development_dependency 'nokogiri', '>= 1.10.10'

  spec.bindir = 'bin'
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tesseract_ffi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Verrier
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-12 00:00:00.000000000 Z
11
+ date: 2020-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -121,9 +121,11 @@ files:
121
121
  - lib/tesseract_ffi/conf_vars.rb
122
122
  - lib/tesseract_ffi/oem.rb
123
123
  - lib/tesseract_ffi/quick.rb
124
+ - lib/tesseract_ffi/rectangles.rb
124
125
  - lib/tesseract_ffi/tess_exception.rb
125
126
  - lib/tesseract_ffi/tesseract.rb
126
127
  - lib/tesseract_ffi/version.rb
128
+ - tesseract_ffi.gemspec
127
129
  - tmp/keep.txt
128
130
  homepage: https://github.com/dverrier/tesseract_ffi
129
131
  licenses: