rtesseract 3.0.2 → 3.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +2 -2
- data/Gemfile.lock +2 -2
- data/Rakefile +3 -3
- data/bin/console +3 -3
- data/lib/rtesseract.rb +15 -12
- data/lib/rtesseract/base.rb +2 -1
- data/lib/rtesseract/box.rb +15 -12
- data/lib/rtesseract/check.rb +3 -4
- data/lib/rtesseract/command.rb +9 -6
- data/lib/rtesseract/configuration.rb +8 -4
- data/lib/rtesseract/pdf.rb +3 -3
- data/lib/rtesseract/text.rb +3 -3
- data/lib/rtesseract/tsv.rb +3 -3
- data/lib/rtesseract/version.rb +1 -1
- data/rtesseract.gemspec +18 -19
- metadata +18 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8eb694ea4b37475f756451795c145b8ba9618a0e5e94b0774a90301bb1aa97a2
|
4
|
+
data.tar.gz: fe725c774fc39720ff830e47e4580f7335def9ece76f67fad77fe9722f415d6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f40f8b53fc3c63e4968d9b1adab5153771730897fd681e354afea79f2007c28c4216dc61cad0d218dc7d76814360ae9169a0ffb8999ab6c8d15ef51ad712ec07
|
7
|
+
data.tar.gz: e2dfbf63b972c6e678d4bb79ec00064a958c9607f76cffe197bc26f555dff35bd7b0e1c263e376ede9598e2f429a344abb4b8d39b0e47ede6d1c09a32e6c44a4
|
data/.rubocop.yml
ADDED
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
3
|
+
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in rtesseract.gemspec
|
6
6
|
gemspec
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
rtesseract (3.0.
|
4
|
+
rtesseract (3.0.3)
|
5
5
|
nokogiri
|
6
6
|
|
7
7
|
GEM
|
@@ -17,7 +17,7 @@ GEM
|
|
17
17
|
docile (1.3.1)
|
18
18
|
json (2.1.0)
|
19
19
|
mini_portile2 (2.4.0)
|
20
|
-
nokogiri (1.
|
20
|
+
nokogiri (1.10.1)
|
21
21
|
mini_portile2 (~> 2.4.0)
|
22
22
|
rake (10.5.0)
|
23
23
|
rspec (3.8.0)
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'rtesseract'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "rtesseract"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start(__FILE__)
|
data/lib/rtesseract.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
1
|
+
require 'rtesseract/check'
|
2
|
+
require 'rtesseract/configuration'
|
3
|
+
require 'rtesseract/command'
|
4
|
+
require 'rtesseract/base'
|
5
|
+
require 'rtesseract/text'
|
6
|
+
require 'rtesseract/pdf'
|
7
|
+
require 'rtesseract/box'
|
8
|
+
require 'rtesseract/tsv'
|
9
9
|
|
10
10
|
class RTesseract
|
11
11
|
class Error < StandardError; end
|
@@ -15,10 +15,11 @@ class RTesseract
|
|
15
15
|
def initialize(src = '', options = {})
|
16
16
|
@source = src
|
17
17
|
@config = RTesseract.config.merge(options)
|
18
|
+
@errors = []
|
18
19
|
end
|
19
20
|
|
20
21
|
def to_box
|
21
|
-
Box.run(@source, config)
|
22
|
+
Box.run(@source, @errors, config)
|
22
23
|
end
|
23
24
|
|
24
25
|
def words
|
@@ -26,20 +27,22 @@ class RTesseract
|
|
26
27
|
end
|
27
28
|
|
28
29
|
def to_pdf
|
29
|
-
Pdf.run(@source, config)
|
30
|
+
Pdf.run(@source, @errors, config)
|
30
31
|
end
|
31
32
|
|
32
33
|
def to_tsv
|
33
|
-
Tsv.run(@source, config)
|
34
|
+
Tsv.run(@source, @errors, config)
|
34
35
|
end
|
35
36
|
|
36
37
|
# Output value
|
37
38
|
def to_s
|
38
|
-
Text.run(@source, config)
|
39
|
+
Text.run(@source, @errors, config)
|
39
40
|
end
|
40
41
|
|
41
42
|
# Remove spaces and break-lines
|
42
43
|
def to_s_without_spaces
|
43
44
|
to_s.gsub(/\s/, '')
|
44
45
|
end
|
46
|
+
|
47
|
+
attr_reader :errors
|
45
48
|
end
|
data/lib/rtesseract/base.rb
CHANGED
data/lib/rtesseract/box.rb
CHANGED
@@ -4,10 +4,10 @@ class RTesseract
|
|
4
4
|
module Box
|
5
5
|
extend RTesseract::Base
|
6
6
|
|
7
|
-
def self.run(source, options)
|
7
|
+
def self.run(source, errors, options)
|
8
8
|
options.tessedit_create_hocr = 1
|
9
9
|
|
10
|
-
RTesseract::Command.new(source, temp_file, options).run
|
10
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
11
11
|
|
12
12
|
parse(File.read(temp_file('.hocr')))
|
13
13
|
end
|
@@ -15,16 +15,19 @@ class RTesseract
|
|
15
15
|
def self.parse(content)
|
16
16
|
html = Nokogiri::HTML(content)
|
17
17
|
html.css('span.ocrx_word, span.ocr_word').map do |word|
|
18
|
-
|
19
|
-
|
20
|
-
{
|
21
|
-
word: word.text,
|
22
|
-
x_start: @attributes[1].to_i,
|
23
|
-
y_start: @attributes[2].to_i,
|
24
|
-
x_end: @attributes[3].to_i,
|
25
|
-
y_end: @attributes[4].to_i
|
26
|
-
}
|
18
|
+
attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
|
19
|
+
word_info(word, attributes)
|
27
20
|
end
|
28
21
|
end
|
22
|
+
|
23
|
+
def self.word_info(word, data)
|
24
|
+
{
|
25
|
+
word: word.text,
|
26
|
+
x_start: data[1].to_i,
|
27
|
+
y_start: data[2].to_i,
|
28
|
+
x_end: data[3].to_i,
|
29
|
+
y_end: data[4].to_i
|
30
|
+
}
|
31
|
+
end
|
29
32
|
end
|
30
|
-
end
|
33
|
+
end
|
data/lib/rtesseract/check.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
|
-
|
2
1
|
class RTesseract
|
3
2
|
class << self
|
4
3
|
def tesseract_version
|
5
|
-
Open3.capture2e(RTesseract.config.command,
|
4
|
+
Open3.capture2e(RTesseract.config.command, '--version').first.to_s.match(/\d+.\d+/)[0].to_f
|
6
5
|
rescue Errno::ENOENT
|
7
6
|
0
|
8
7
|
end
|
9
8
|
|
10
9
|
def check_version!
|
11
|
-
raise RTesseract::Error
|
10
|
+
raise RTesseract::Error, 'Tesseract OCR 3.5 or later not installed' if RTesseract.tesseract_version < 3.05
|
12
11
|
end
|
13
12
|
end
|
14
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/command.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
class RTesseract
|
2
2
|
class Command
|
3
|
-
FIXED = [
|
3
|
+
FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze
|
4
4
|
|
5
5
|
attr_reader :options
|
6
6
|
|
7
|
-
def initialize(source, output, options)
|
7
|
+
def initialize(source, output, errors, options)
|
8
8
|
@source = source
|
9
9
|
@output = output
|
10
10
|
@options = options
|
11
|
-
@
|
11
|
+
@errors = errors
|
12
|
+
@full_command = [options.command, @source, @output]
|
12
13
|
end
|
13
14
|
|
14
15
|
def full_command
|
@@ -41,11 +42,13 @@ class RTesseract
|
|
41
42
|
end
|
42
43
|
|
43
44
|
def run
|
44
|
-
output, status = Open3.
|
45
|
+
output, error, status = Open3.capture3(*full_command.flatten)
|
46
|
+
|
47
|
+
@errors.push(error)
|
45
48
|
|
46
49
|
return output if status.success?
|
47
50
|
|
48
|
-
raise RTesseract::Error
|
51
|
+
raise RTesseract::Error, error
|
49
52
|
end
|
50
53
|
end
|
51
|
-
end
|
54
|
+
end
|
data/lib/rtesseract/configuration.rb
CHANGED
@@ -3,16 +3,20 @@ require 'ostruct'
|
|
3
3
|
class RTesseract
|
4
4
|
class Configuration < OpenStruct
|
5
5
|
def merge(options)
|
6
|
-
RTesseract::Configuration.new(
|
6
|
+
RTesseract::Configuration.new(to_h.merge(options))
|
7
|
+
end
|
8
|
+
|
9
|
+
def command
|
10
|
+
@table[:command]
|
7
11
|
end
|
8
12
|
end
|
9
13
|
|
10
14
|
class << self
|
11
15
|
def config
|
12
16
|
@config ||= RTesseract::Configuration.new(
|
13
|
-
|
14
|
-
|
15
|
-
|
17
|
+
command: 'tesseract',
|
18
|
+
debug_file: '/dev/null'
|
19
|
+
)
|
16
20
|
end
|
17
21
|
|
18
22
|
def configure
|
data/lib/rtesseract/pdf.rb
CHANGED
@@ -2,12 +2,12 @@ class RTesseract
|
|
2
2
|
module Pdf
|
3
3
|
extend Base
|
4
4
|
|
5
|
-
def self.run(source, options)
|
5
|
+
def self.run(source, errors, options)
|
6
6
|
options.tessedit_create_pdf = 1
|
7
7
|
|
8
|
-
RTesseract::Command.new(source, temp_file, options).run
|
8
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
9
9
|
|
10
10
|
File.open(temp_file('.pdf'), 'r')
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/text.rb
CHANGED
@@ -2,8 +2,8 @@ require 'open3'
|
|
2
2
|
|
3
3
|
class RTesseract
|
4
4
|
module Text
|
5
|
-
def self.run(source, options)
|
6
|
-
RTesseract::Command.new(source, 'stdout', options).run
|
5
|
+
def self.run(source, errors, options)
|
6
|
+
RTesseract::Command.new(source, 'stdout', errors, options).run
|
7
7
|
end
|
8
8
|
end
|
9
|
-
end
|
9
|
+
end
|
data/lib/rtesseract/tsv.rb
CHANGED
@@ -2,12 +2,12 @@ class RTesseract
|
|
2
2
|
module Tsv
|
3
3
|
extend Base
|
4
4
|
|
5
|
-
def self.run(source, options)
|
5
|
+
def self.run(source, errors, options)
|
6
6
|
options.tessedit_create_tsv = 1
|
7
7
|
|
8
|
-
RTesseract::Command.new(source, temp_file, options).run
|
8
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
9
9
|
|
10
10
|
File.open(temp_file('.tsv'), 'r')
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/version.rb
CHANGED
data/rtesseract.gemspec
CHANGED
@@ -1,33 +1,32 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require
|
3
|
+
require 'rtesseract/version'
|
5
4
|
|
6
5
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
6
|
+
spec.name = 'rtesseract'
|
8
7
|
spec.version = RTesseract::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
8
|
+
spec.authors = ['Danilo Jeremias da Silva']
|
9
|
+
spec.email = ['dannnylo@gmail.com']
|
11
10
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
11
|
+
spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
|
12
|
+
spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
|
13
|
+
spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
|
14
|
+
spec.license = 'MIT'
|
16
15
|
|
17
16
|
# Specify which files should be added to the gem when it is released.
|
18
17
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
19
|
-
spec.files = Dir.chdir(File.expand_path(
|
18
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
20
19
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
21
20
|
end
|
22
|
-
spec.bindir =
|
21
|
+
spec.bindir = 'exe'
|
23
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
|
-
spec.require_paths = [
|
23
|
+
spec.require_paths = ['lib']
|
25
24
|
|
26
|
-
spec.add_development_dependency
|
27
|
-
spec.add_development_dependency
|
28
|
-
spec.add_development_dependency
|
29
|
-
spec.add_development_dependency
|
30
|
-
spec.add_development_dependency
|
25
|
+
spec.add_development_dependency 'bundler', '~> 1.17'
|
26
|
+
spec.add_development_dependency 'coveralls'
|
27
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
28
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
29
|
+
spec.add_development_dependency 'simplecov'
|
31
30
|
|
32
|
-
spec.add_dependency
|
31
|
+
spec.add_dependency 'nokogiri'
|
33
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -25,49 +25,49 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.17'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: coveralls
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '10.0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '10.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
61
|
+
version: '3.0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
68
|
+
version: '3.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: simplecov
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- ".document"
|
105
105
|
- ".gitignore"
|
106
106
|
- ".rspec"
|
107
|
+
- ".rubocop.yml"
|
107
108
|
- ".travis.yml"
|
108
109
|
- CHANGELOG.md
|
109
110
|
- CODE_OF_CONDUCT.md
|
@@ -145,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
145
146
|
version: '0'
|
146
147
|
requirements: []
|
147
148
|
rubyforge_project:
|
148
|
-
rubygems_version: 2.7.
|
149
|
+
rubygems_version: 2.7.8
|
149
150
|
signing_key:
|
150
151
|
specification_version: 4
|
151
152
|
summary: Ruby library for working with the Tesseract OCR.
|