rtesseract 3.1.2 → 3.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.deepsource.toml +9 -0
- data/.github/FUNDING.yml +1 -0
- data/.github/workflows/ci.yml +12 -7
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +9 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +27 -34
- data/README.md +17 -3
- data/lib/rtesseract/base.rb +4 -0
- data/lib/rtesseract/box.rb +8 -5
- data/lib/rtesseract/pdf.rb +1 -1
- data/lib/rtesseract/text.rb +3 -1
- data/lib/rtesseract/tsv.rb +7 -2
- data/lib/rtesseract/version.rb +1 -1
- data/lib/rtesseract.rb +5 -7
- data/rtesseract.gemspec +2 -4
- metadata +9 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0a6663ca2e040b7f9ea75dba9efd078c4d4e8e9ddd51f5e6891049328fc39492
|
4
|
+
data.tar.gz: 0f0cb1ccfbaf0fde1a04da091f55ee536992504f2fcf698ab56fbd354292b2a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5a661e73977518acc17627a850d1cdc22df44af6ed4118b974af8b9a487cab1bf487583cf3a196c4b23b3a708d1fed9521ecbd5aa1e0d5d0284bdc671739fe3
|
7
|
+
data.tar.gz: b575ccd5b3fe076d0565857a558ca454b0e919e1b421ca071b6eaa8636b20ffed8f899280d666ba2735f0cf73e72552535944aa55f72a8c985116008f7059457
|
data/.deepsource.toml
ADDED
data/.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
github: dannnylo
|
data/.github/workflows/ci.yml
CHANGED
@@ -6,26 +6,31 @@ jobs:
|
|
6
6
|
strategy:
|
7
7
|
matrix:
|
8
8
|
ruby:
|
9
|
-
- '2.
|
10
|
-
- '
|
11
|
-
|
9
|
+
- '3.2.0'
|
10
|
+
- '3.4.2'
|
11
|
+
repository:
|
12
|
+
- 'ppa:alex-p/tesseract-ocr5'
|
13
|
+
- 'ppa:alex-p/tesseract-ocr-devel'
|
12
14
|
steps:
|
13
15
|
- uses: actions/checkout@v2
|
14
16
|
- name: Install tesseract-ocr
|
15
17
|
run: |
|
16
|
-
sudo add-apt-repository
|
18
|
+
sudo add-apt-repository ${{ matrix.repository }} -y
|
17
19
|
sudo apt-get update -q
|
18
20
|
sudo apt-get install tesseract-ocr tesseract-ocr-eng ghostscript -y
|
21
|
+
tesseract --version
|
19
22
|
- name: Setup Ruby
|
20
|
-
uses:
|
23
|
+
uses: ruby/setup-ruby@v1
|
21
24
|
with:
|
22
25
|
ruby-version: ${{ matrix.ruby }}
|
23
26
|
- name: Bundle
|
24
|
-
env:
|
25
|
-
MTSR_RAILS_VERSION: ${{ matrix.rails }}
|
26
27
|
run: |
|
27
28
|
gem uninstall -aIx bundler
|
28
29
|
gem install bundler
|
29
30
|
bundle install --jobs 4 --retry 3
|
30
31
|
- name: Test
|
31
32
|
run: bundle exec rake
|
33
|
+
- name: Coverage
|
34
|
+
env:
|
35
|
+
CODACY_PROJECT_TOKEN: ${{ secrets.CODACY_PROJECT_TOKEN }}
|
36
|
+
run: bash <(curl -Ls https://coverage.codacy.com/get.sh) report -l Ruby -r coverage/lcov/*
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
# Changes
|
2
|
+
|
3
|
+
## v3.1.4
|
4
|
+
#### Changed
|
5
|
+
* Temporary hocr file is deleted after the file is processed.
|
6
|
+
|
7
|
+
## v3.1.3
|
8
|
+
* Fixed a configuration error that prevented different kinds of calls on the same object; for example, calling .to_box and then .to_s would result in unexpected behavior.
|
9
|
+
|
1
10
|
## v3.1.2
|
2
11
|
|
3
12
|
#### Added
|
data/Gemfile
CHANGED
@@ -6,3 +6,13 @@ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in rtesseract.gemspec
|
8
8
|
gemspec
|
9
|
+
|
10
|
+
group :development, :test do
|
11
|
+
gem 'bundler', '~> 2'
|
12
|
+
gem 'rake'
|
13
|
+
gem 'rspec'
|
14
|
+
|
15
|
+
gem 'simplecov'
|
16
|
+
gem 'simplecov-cobertura'
|
17
|
+
gem 'simplecov-lcov'
|
18
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,55 +1,48 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
rtesseract (3.1.
|
4
|
+
rtesseract (3.1.4)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
rspec (3.9.0)
|
20
|
-
rspec-core (~> 3.9.0)
|
21
|
-
rspec-expectations (~> 3.9.0)
|
22
|
-
rspec-mocks (~> 3.9.0)
|
23
|
-
rspec-core (3.9.1)
|
24
|
-
rspec-support (~> 3.9.1)
|
25
|
-
rspec-expectations (3.9.1)
|
9
|
+
diff-lcs (1.4.4)
|
10
|
+
docile (1.4.0)
|
11
|
+
rake (13.0.6)
|
12
|
+
rspec (3.10.0)
|
13
|
+
rspec-core (~> 3.10.0)
|
14
|
+
rspec-expectations (~> 3.10.0)
|
15
|
+
rspec-mocks (~> 3.10.0)
|
16
|
+
rspec-core (3.10.1)
|
17
|
+
rspec-support (~> 3.10.0)
|
18
|
+
rspec-expectations (3.10.1)
|
26
19
|
diff-lcs (>= 1.2.0, < 2.0)
|
27
|
-
rspec-support (~> 3.
|
28
|
-
rspec-mocks (3.
|
20
|
+
rspec-support (~> 3.10.0)
|
21
|
+
rspec-mocks (3.10.2)
|
29
22
|
diff-lcs (>= 1.2.0, < 2.0)
|
30
|
-
rspec-support (~> 3.
|
31
|
-
rspec-support (3.
|
32
|
-
simplecov (0.
|
23
|
+
rspec-support (~> 3.10.0)
|
24
|
+
rspec-support (3.10.2)
|
25
|
+
simplecov (0.21.2)
|
33
26
|
docile (~> 1.1)
|
34
|
-
|
35
|
-
|
36
|
-
simplecov-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
tins (1.24.1)
|
42
|
-
sync
|
27
|
+
simplecov-html (~> 0.11)
|
28
|
+
simplecov_json_formatter (~> 0.1)
|
29
|
+
simplecov-cobertura (1.4.2)
|
30
|
+
simplecov (~> 0.8)
|
31
|
+
simplecov-html (0.12.3)
|
32
|
+
simplecov-lcov (0.8.0)
|
33
|
+
simplecov_json_formatter (0.1.3)
|
43
34
|
|
44
35
|
PLATFORMS
|
45
36
|
ruby
|
46
37
|
|
47
38
|
DEPENDENCIES
|
48
39
|
bundler (~> 2)
|
49
|
-
coveralls
|
50
40
|
rake
|
51
41
|
rspec
|
52
42
|
rtesseract!
|
43
|
+
simplecov
|
44
|
+
simplecov-cobertura
|
45
|
+
simplecov-lcov
|
53
46
|
|
54
47
|
BUNDLED WITH
|
55
|
-
2.
|
48
|
+
2.4.20
|
data/README.md
CHANGED
@@ -6,8 +6,11 @@
|
|
6
6
|
<a href='https://github.com/dannnylo/rtesseract/workflows/CI/badge.svg'>
|
7
7
|
<img src="https://github.com/dannnylo/rtesseract/workflows/CI/badge.svg" alt="Build Status" />
|
8
8
|
</a>
|
9
|
-
<a href='https://
|
10
|
-
|
9
|
+
<a href='https://app.codacy.com/project/badge/Grade/316a48934db8415d84d2f9a318b0f837'>
|
10
|
+
<img src="https://app.codacy.com/project/badge/Grade/316a48934db8415d84d2f9a318b0f837" alt="Coverage Status" />
|
11
|
+
</a>
|
12
|
+
<a href='https://app.codacy.com/project/badge/Coverage/316a48934db8415d84d2f9a318b0f837'>
|
13
|
+
<img src="https://app.codacy.com/project/badge/Coverage/316a48934db8415d84d2f9a318b0f837" alt="Coverage" />
|
11
14
|
</a>
|
12
15
|
<a href='https://codeclimate.com/github/dannnylo/rtesseract'>
|
13
16
|
<img src="https://codeclimate.com/github/dannnylo/rtesseract.png" />
|
@@ -17,10 +20,21 @@ Ruby library for working with the Tesseract OCR.
|
|
17
20
|
|
18
21
|
## Installation
|
19
22
|
|
20
|
-
Check if tesseract ocr programs
|
23
|
+
Check if tesseract ocr programs are installed:
|
21
24
|
|
22
25
|
$ tesseract --version
|
23
26
|
|
27
|
+
If not, you can install them with a command like:
|
28
|
+
|
29
|
+
$ apt install tesseract-ocr
|
30
|
+
|
31
|
+
or
|
32
|
+
|
33
|
+
$ brew install tesseract
|
34
|
+
|
35
|
+
or, on Heroku 22, add the buildpack https://github.com/pathwaysmedical/heroku-buildpack-tesseract
|
36
|
+
|
37
|
+
|
24
38
|
Add this line to your application's Gemfile:
|
25
39
|
|
26
40
|
```ruby
|
data/lib/rtesseract/base.rb
CHANGED
data/lib/rtesseract/box.rb
CHANGED
@@ -6,10 +6,13 @@ class RTesseract
|
|
6
6
|
|
7
7
|
class << self
|
8
8
|
def run(source, errors, options)
|
9
|
-
options.tessedit_create_hocr
|
9
|
+
options = options.merge({ tessedit_create_hocr: 1 })
|
10
10
|
|
11
11
|
RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path|
|
12
|
-
|
12
|
+
filename = "#{output_path}.hocr"
|
13
|
+
content = File.read(filename)
|
14
|
+
remove_tmp_file(filename)
|
15
|
+
parse(content)
|
13
16
|
end
|
14
17
|
end
|
15
18
|
|
@@ -20,7 +23,7 @@ class RTesseract
|
|
20
23
|
def parse_line(line)
|
21
24
|
return unless line.match?(/oc(rx|r)_word/)
|
22
25
|
|
23
|
-
word = line.
|
26
|
+
word = line.to_s.scan(/>(.*)</).flatten.first.to_s
|
24
27
|
|
25
28
|
return if word.strip == ''
|
26
29
|
|
@@ -39,11 +42,11 @@ class RTesseract
|
|
39
42
|
end
|
40
43
|
|
41
44
|
def parse_position(line)
|
42
|
-
line.match(/(?<=title)(.*?)(?=;)/).to_s.split
|
45
|
+
line.match(/(?<=title)(.*?)(?=;)/).to_s.split
|
43
46
|
end
|
44
47
|
|
45
48
|
def parse_confidence(line)
|
46
|
-
line.match(/(?<=;)(.*?)(?=')/).to_s.split
|
49
|
+
line.match(/(?<=;)(.*?)(?=')/).to_s.split
|
47
50
|
end
|
48
51
|
end
|
49
52
|
end
|
data/lib/rtesseract/pdf.rb
CHANGED
@@ -5,7 +5,7 @@ class RTesseract
|
|
5
5
|
extend Base
|
6
6
|
|
7
7
|
def self.run(source, errors, options)
|
8
|
-
options.tessedit_create_pdf
|
8
|
+
options = options.merge({ tessedit_create_pdf: 1 })
|
9
9
|
|
10
10
|
RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path|
|
11
11
|
File.open("#{output_path}.pdf", 'r')
|
data/lib/rtesseract/text.rb
CHANGED
@@ -5,7 +5,9 @@ require 'open3'
|
|
5
5
|
class RTesseract
|
6
6
|
module Text
|
7
7
|
def self.run(source, errors, options)
|
8
|
-
RTesseract::Command.new(source, 'stdout', errors, options).run
|
8
|
+
text = RTesseract::Command.new(source, 'stdout', errors, options).run
|
9
|
+
text = text.gsub("\f", '') if text.is_a?(String)
|
10
|
+
text
|
9
11
|
end
|
10
12
|
end
|
11
13
|
end
|
data/lib/rtesseract/tsv.rb
CHANGED
@@ -5,9 +5,14 @@ class RTesseract
|
|
5
5
|
extend Base
|
6
6
|
|
7
7
|
def self.run(source, errors, options)
|
8
|
-
options.tessedit_create_tsv
|
8
|
+
options = options.merge({ tessedit_create_tsv: 1 })
|
9
9
|
|
10
|
-
RTesseract::Command.new(
|
10
|
+
RTesseract::Command.new(
|
11
|
+
source,
|
12
|
+
temp_file_path,
|
13
|
+
errors,
|
14
|
+
options
|
15
|
+
).run do |output_path|
|
11
16
|
File.open("#{output_path}.tsv", 'r')
|
12
17
|
end
|
13
18
|
end
|
data/lib/rtesseract/version.rb
CHANGED
data/lib/rtesseract.rb
CHANGED
@@ -12,7 +12,7 @@ require 'rtesseract/tsv'
|
|
12
12
|
class RTesseract
|
13
13
|
class Error < StandardError; end
|
14
14
|
|
15
|
-
attr_reader :config, :source
|
15
|
+
attr_reader :config, :source, :errors
|
16
16
|
|
17
17
|
def initialize(src = '', options = {})
|
18
18
|
@source = src
|
@@ -21,7 +21,7 @@ class RTesseract
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def to_box
|
24
|
-
Box.run(@source, @errors, config)
|
24
|
+
Box.run(@source, @errors, @config)
|
25
25
|
end
|
26
26
|
|
27
27
|
def words
|
@@ -29,22 +29,20 @@ class RTesseract
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def to_pdf
|
32
|
-
Pdf.run(@source, @errors, config)
|
32
|
+
Pdf.run(@source, @errors, @config)
|
33
33
|
end
|
34
34
|
|
35
35
|
def to_tsv
|
36
|
-
Tsv.run(@source, @errors, config)
|
36
|
+
Tsv.run(@source, @errors, @config)
|
37
37
|
end
|
38
38
|
|
39
39
|
# Output value
|
40
40
|
def to_s
|
41
|
-
Text.run(@source, @errors, config)
|
41
|
+
Text.run(@source, @errors, @config)
|
42
42
|
end
|
43
43
|
|
44
44
|
# Remove spaces and break-lines
|
45
45
|
def to_s_without_spaces
|
46
46
|
to_s.gsub(/\s/, '')
|
47
47
|
end
|
48
|
-
|
49
|
-
attr_reader :errors
|
50
48
|
end
|
data/rtesseract.gemspec
CHANGED
@@ -20,12 +20,10 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
21
21
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
22
|
end
|
23
|
+
spec.required_ruby_version = '>= 2.7'
|
23
24
|
spec.bindir = 'exe'
|
24
25
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
25
26
|
spec.require_paths = ['lib']
|
26
27
|
|
27
|
-
spec.
|
28
|
-
spec.add_development_dependency 'coveralls'
|
29
|
-
spec.add_development_dependency 'rake'
|
30
|
-
spec.add_development_dependency 'rspec'
|
28
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
31
29
|
end
|
metadata
CHANGED
@@ -1,71 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '2'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '2'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: coveralls
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rspec
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
10
|
+
date: 2025-04-30 00:00:00.000000000 Z
|
11
|
+
dependencies: []
|
69
12
|
description: Ruby library for working with the Tesseract OCR.
|
70
13
|
email:
|
71
14
|
- dannnylo@gmail.com
|
@@ -73,7 +16,9 @@ executables: []
|
|
73
16
|
extensions: []
|
74
17
|
extra_rdoc_files: []
|
75
18
|
files:
|
19
|
+
- ".deepsource.toml"
|
76
20
|
- ".document"
|
21
|
+
- ".github/FUNDING.yml"
|
77
22
|
- ".github/workflows/ci.yml"
|
78
23
|
- ".gitignore"
|
79
24
|
- ".hound.yml"
|
@@ -102,8 +47,8 @@ files:
|
|
102
47
|
homepage: http://github.com/dannnylo/rtesseract
|
103
48
|
licenses:
|
104
49
|
- MIT
|
105
|
-
metadata:
|
106
|
-
|
50
|
+
metadata:
|
51
|
+
rubygems_mfa_required: 'true'
|
107
52
|
rdoc_options: []
|
108
53
|
require_paths:
|
109
54
|
- lib
|
@@ -111,15 +56,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
111
56
|
requirements:
|
112
57
|
- - ">="
|
113
58
|
- !ruby/object:Gem::Version
|
114
|
-
version: '
|
59
|
+
version: '2.7'
|
115
60
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
61
|
requirements:
|
117
62
|
- - ">="
|
118
63
|
- !ruby/object:Gem::Version
|
119
64
|
version: '0'
|
120
65
|
requirements: []
|
121
|
-
rubygems_version: 3.
|
122
|
-
signing_key:
|
66
|
+
rubygems_version: 3.6.3
|
123
67
|
specification_version: 4
|
124
68
|
summary: Ruby library for working with the Tesseract OCR.
|
125
69
|
test_files: []
|