rtesseract 3.0.2 → 3.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +2 -2
- data/Gemfile.lock +2 -2
- data/Rakefile +3 -3
- data/bin/console +3 -3
- data/lib/rtesseract.rb +15 -12
- data/lib/rtesseract/base.rb +2 -1
- data/lib/rtesseract/box.rb +15 -12
- data/lib/rtesseract/check.rb +3 -4
- data/lib/rtesseract/command.rb +9 -6
- data/lib/rtesseract/configuration.rb +8 -4
- data/lib/rtesseract/pdf.rb +3 -3
- data/lib/rtesseract/text.rb +3 -3
- data/lib/rtesseract/tsv.rb +3 -3
- data/lib/rtesseract/version.rb +1 -1
- data/rtesseract.gemspec +18 -19
- metadata +18 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8eb694ea4b37475f756451795c145b8ba9618a0e5e94b0774a90301bb1aa97a2
|
4
|
+
data.tar.gz: fe725c774fc39720ff830e47e4580f7335def9ece76f67fad77fe9722f415d6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f40f8b53fc3c63e4968d9b1adab5153771730897fd681e354afea79f2007c28c4216dc61cad0d218dc7d76814360ae9169a0ffb8999ab6c8d15ef51ad712ec07
|
7
|
+
data.tar.gz: e2dfbf63b972c6e678d4bb79ec00064a958c9607f76cffe197bc26f555dff35bd7b0e1c263e376ede9598e2f429a344abb4b8d39b0e47ede6d1c09a32e6c44a4
|
data/.rubocop.yml
ADDED
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
3
|
+
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in rtesseract.gemspec
|
6
6
|
gemspec
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
rtesseract (3.0.2)
|
4
|
+
rtesseract (3.0.3)
|
5
5
|
nokogiri
|
6
6
|
|
7
7
|
GEM
|
@@ -17,7 +17,7 @@ GEM
|
|
17
17
|
docile (1.3.1)
|
18
18
|
json (2.1.0)
|
19
19
|
mini_portile2 (2.4.0)
|
20
|
-
nokogiri (1.
|
20
|
+
nokogiri (1.10.1)
|
21
21
|
mini_portile2 (~> 2.4.0)
|
22
22
|
rake (10.5.0)
|
23
23
|
rspec (3.8.0)
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require "rtesseract"
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'rtesseract'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "rtesseract"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start(__FILE__)
|
data/lib/rtesseract.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
1
|
+
require 'rtesseract/check'
|
2
|
+
require 'rtesseract/configuration'
|
3
|
+
require 'rtesseract/command'
|
4
|
+
require 'rtesseract/base'
|
5
|
+
require 'rtesseract/text'
|
6
|
+
require 'rtesseract/pdf'
|
7
|
+
require 'rtesseract/box'
|
8
|
+
require 'rtesseract/tsv'
|
9
9
|
|
10
10
|
class RTesseract
|
11
11
|
class Error < StandardError; end
|
@@ -15,10 +15,11 @@ class RTesseract
|
|
15
15
|
def initialize(src = '', options = {})
|
16
16
|
@source = src
|
17
17
|
@config = RTesseract.config.merge(options)
|
18
|
+
@errors = []
|
18
19
|
end
|
19
20
|
|
20
21
|
def to_box
|
21
|
-
Box.run(@source, config)
|
22
|
+
Box.run(@source, @errors, config)
|
22
23
|
end
|
23
24
|
|
24
25
|
def words
|
@@ -26,20 +27,22 @@ class RTesseract
|
|
26
27
|
end
|
27
28
|
|
28
29
|
def to_pdf
|
29
|
-
Pdf.run(@source, config)
|
30
|
+
Pdf.run(@source, @errors, config)
|
30
31
|
end
|
31
32
|
|
32
33
|
def to_tsv
|
33
|
-
Tsv.run(@source, config)
|
34
|
+
Tsv.run(@source, @errors, config)
|
34
35
|
end
|
35
36
|
|
36
37
|
# Output value
|
37
38
|
def to_s
|
38
|
-
Text.run(@source, config)
|
39
|
+
Text.run(@source, @errors, config)
|
39
40
|
end
|
40
41
|
|
41
42
|
# Remove spaces and break-lines
|
42
43
|
def to_s_without_spaces
|
43
44
|
to_s.gsub(/\s/, '')
|
44
45
|
end
|
46
|
+
|
47
|
+
attr_reader :errors
|
45
48
|
end
|
data/lib/rtesseract/base.rb
CHANGED
data/lib/rtesseract/box.rb
CHANGED
@@ -4,10 +4,10 @@ class RTesseract
|
|
4
4
|
module Box
|
5
5
|
extend RTesseract::Base
|
6
6
|
|
7
|
-
def self.run(source, options)
|
7
|
+
def self.run(source, errors, options)
|
8
8
|
options.tessedit_create_hocr = 1
|
9
9
|
|
10
|
-
RTesseract::Command.new(source, temp_file, options).run
|
10
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
11
11
|
|
12
12
|
parse(File.read(temp_file('.hocr')))
|
13
13
|
end
|
@@ -15,16 +15,19 @@ class RTesseract
|
|
15
15
|
def self.parse(content)
|
16
16
|
html = Nokogiri::HTML(content)
|
17
17
|
html.css('span.ocrx_word, span.ocr_word').map do |word|
|
18
|
-
|
19
|
-
|
20
|
-
{
|
21
|
-
word: word.text,
|
22
|
-
x_start: @attributes[1].to_i,
|
23
|
-
y_start: @attributes[2].to_i,
|
24
|
-
x_end: @attributes[3].to_i,
|
25
|
-
y_end: @attributes[4].to_i
|
26
|
-
}
|
18
|
+
attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
|
19
|
+
word_info(word, attributes)
|
27
20
|
end
|
28
21
|
end
|
22
|
+
|
23
|
+
def self.word_info(word, data)
|
24
|
+
{
|
25
|
+
word: word.text,
|
26
|
+
x_start: data[1].to_i,
|
27
|
+
y_start: data[2].to_i,
|
28
|
+
x_end: data[3].to_i,
|
29
|
+
y_end: data[4].to_i
|
30
|
+
}
|
31
|
+
end
|
29
32
|
end
|
30
|
-
end
|
33
|
+
end
|
data/lib/rtesseract/check.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
|
-
|
2
1
|
class RTesseract
|
3
2
|
class << self
|
4
3
|
def tesseract_version
|
5
|
-
Open3.capture2e(RTesseract.config.command,
|
4
|
+
Open3.capture2e(RTesseract.config.command, '--version').first.to_s.match(/\d+.\d+/)[0].to_f
|
6
5
|
rescue Errno::ENOENT
|
7
6
|
0
|
8
7
|
end
|
9
8
|
|
10
9
|
def check_version!
|
11
|
-
raise RTesseract::Error
|
10
|
+
raise RTesseract::Error, 'Tesseract OCR 3.5 or later not installed' if RTesseract.tesseract_version < 3.05
|
12
11
|
end
|
13
12
|
end
|
14
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/command.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
class RTesseract
|
2
2
|
class Command
|
3
|
-
FIXED = [
|
3
|
+
FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze
|
4
4
|
|
5
5
|
attr_reader :options
|
6
6
|
|
7
|
-
def initialize(source, output, options)
|
7
|
+
def initialize(source, output, errors, options)
|
8
8
|
@source = source
|
9
9
|
@output = output
|
10
10
|
@options = options
|
11
|
-
@
|
11
|
+
@errors = errors
|
12
|
+
@full_command = [options.command, @source, @output]
|
12
13
|
end
|
13
14
|
|
14
15
|
def full_command
|
@@ -41,11 +42,13 @@ class RTesseract
|
|
41
42
|
end
|
42
43
|
|
43
44
|
def run
|
44
|
-
output, status = Open3.
|
45
|
+
output, error, status = Open3.capture3(*full_command.flatten)
|
46
|
+
|
47
|
+
@errors.push(error)
|
45
48
|
|
46
49
|
return output if status.success?
|
47
50
|
|
48
|
-
raise RTesseract::Error
|
51
|
+
raise RTesseract::Error, error
|
49
52
|
end
|
50
53
|
end
|
51
|
-
end
|
54
|
+
end
|
data/lib/rtesseract/configuration.rb
CHANGED
@@ -3,16 +3,20 @@ require 'ostruct'
|
|
3
3
|
class RTesseract
|
4
4
|
class Configuration < OpenStruct
|
5
5
|
def merge(options)
|
6
|
-
RTesseract::Configuration.new(
|
6
|
+
RTesseract::Configuration.new(to_h.merge(options))
|
7
|
+
end
|
8
|
+
|
9
|
+
def command
|
10
|
+
@table[:command]
|
7
11
|
end
|
8
12
|
end
|
9
13
|
|
10
14
|
class << self
|
11
15
|
def config
|
12
16
|
@config ||= RTesseract::Configuration.new(
|
13
|
-
|
14
|
-
|
15
|
-
|
17
|
+
command: 'tesseract',
|
18
|
+
debug_file: '/dev/null'
|
19
|
+
)
|
16
20
|
end
|
17
21
|
|
18
22
|
def configure
|
data/lib/rtesseract/pdf.rb
CHANGED
@@ -2,12 +2,12 @@ class RTesseract
|
|
2
2
|
module Pdf
|
3
3
|
extend Base
|
4
4
|
|
5
|
-
def self.run(source, options)
|
5
|
+
def self.run(source, errors, options)
|
6
6
|
options.tessedit_create_pdf = 1
|
7
7
|
|
8
|
-
RTesseract::Command.new(source, temp_file, options).run
|
8
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
9
9
|
|
10
10
|
File.open(temp_file('.pdf'), 'r')
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/text.rb
CHANGED
@@ -2,8 +2,8 @@ require 'open3'
|
|
2
2
|
|
3
3
|
class RTesseract
|
4
4
|
module Text
|
5
|
-
def self.run(source, options)
|
6
|
-
RTesseract::Command.new(source, 'stdout', options).run
|
5
|
+
def self.run(source, errors, options)
|
6
|
+
RTesseract::Command.new(source, 'stdout', errors, options).run
|
7
7
|
end
|
8
8
|
end
|
9
|
-
end
|
9
|
+
end
|
data/lib/rtesseract/tsv.rb
CHANGED
@@ -2,12 +2,12 @@ class RTesseract
|
|
2
2
|
module Tsv
|
3
3
|
extend Base
|
4
4
|
|
5
|
-
def self.run(source, options)
|
5
|
+
def self.run(source, errors, options)
|
6
6
|
options.tessedit_create_tsv = 1
|
7
7
|
|
8
|
-
RTesseract::Command.new(source, temp_file, options).run
|
8
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
9
9
|
|
10
10
|
File.open(temp_file('.tsv'), 'r')
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/rtesseract/version.rb
CHANGED
data/rtesseract.gemspec
CHANGED
@@ -1,33 +1,32 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require
|
3
|
+
require 'rtesseract/version'
|
5
4
|
|
6
5
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
6
|
+
spec.name = 'rtesseract'
|
8
7
|
spec.version = RTesseract::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
8
|
+
spec.authors = ['Danilo Jeremias da Silva']
|
9
|
+
spec.email = ['dannnylo@gmail.com']
|
11
10
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
11
|
+
spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
|
12
|
+
spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
|
13
|
+
spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
|
14
|
+
spec.license = 'MIT'
|
16
15
|
|
17
16
|
# Specify which files should be added to the gem when it is released.
|
18
17
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
19
|
-
spec.files = Dir.chdir(File.expand_path(
|
18
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
20
19
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
21
20
|
end
|
22
|
-
spec.bindir =
|
21
|
+
spec.bindir = 'exe'
|
23
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
|
-
spec.require_paths = [
|
23
|
+
spec.require_paths = ['lib']
|
25
24
|
|
26
|
-
spec.add_development_dependency
|
27
|
-
spec.add_development_dependency
|
28
|
-
spec.add_development_dependency
|
29
|
-
spec.add_development_dependency
|
30
|
-
spec.add_development_dependency
|
25
|
+
spec.add_development_dependency 'bundler', '~> 1.17'
|
26
|
+
spec.add_development_dependency 'coveralls'
|
27
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
28
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
29
|
+
spec.add_development_dependency 'simplecov'
|
31
30
|
|
32
|
-
spec.add_dependency
|
31
|
+
spec.add_dependency 'nokogiri'
|
33
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.2
|
4
|
+
version: 3.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -25,49 +25,49 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.17'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: coveralls
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '10.0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '10.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
61
|
+
version: '3.0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
68
|
+
version: '3.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: simplecov
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- ".document"
|
105
105
|
- ".gitignore"
|
106
106
|
- ".rspec"
|
107
|
+
- ".rubocop.yml"
|
107
108
|
- ".travis.yml"
|
108
109
|
- CHANGELOG.md
|
109
110
|
- CODE_OF_CONDUCT.md
|
@@ -145,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
145
146
|
version: '0'
|
146
147
|
requirements: []
|
147
148
|
rubyforge_project:
|
148
|
-
rubygems_version: 2.7.
|
149
|
+
rubygems_version: 2.7.8
|
149
150
|
signing_key:
|
150
151
|
specification_version: 4
|
151
152
|
summary: Ruby library for working with the Tesseract OCR.
|