rtesseract 3.0.5 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.hound.yml +3 -0
- data/.rubocop.yml +15 -3
- data/Gemfile +2 -0
- data/Gemfile.lock +19 -26
- data/Rakefile +2 -0
- data/bin/console +1 -0
- data/lib/rtesseract.rb +2 -0
- data/lib/rtesseract/base.rb +2 -0
- data/lib/rtesseract/box.rb +32 -20
- data/lib/rtesseract/check.rb +2 -0
- data/lib/rtesseract/command.rb +2 -0
- data/lib/rtesseract/configuration.rb +2 -0
- data/lib/rtesseract/pdf.rb +2 -0
- data/lib/rtesseract/text.rb +2 -0
- data/lib/rtesseract/tsv.rb +2 -0
- data/lib/rtesseract/version.rb +3 -1
- data/rtesseract.gemspec +5 -6
- metadata +3 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43dd931b6649c05a5cb484f3fc0548075d088e5682bbbea46d7ac3202a9b607c
|
4
|
+
data.tar.gz: 1b300fa5a83f458c986dabde158b3ff31dfeed6e7c7c3c98a17812fdf62c4bbb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 955b9c159f1a89d681e5add1151d79ad2f0f29fbcd65d5558cb08b4a856f5442b6dfd4ae157b57f5bad869dbf9690a13bd1784bb23a7068a7c8402c8abba2468
|
7
|
+
data.tar.gz: daa3a7aa5ba7549c30f0bb85019c6197acec22854122887a8fd5693b52828f8c1b1210806c6d307f1109ddb784f13f090192884f8ae820941b444055df79de7e
|
data/.hound.yml
ADDED
data/.rubocop.yml
CHANGED
@@ -1,8 +1,20 @@
|
|
1
|
-
Documentation:
|
2
|
-
Enabled: false
|
3
1
|
|
4
|
-
|
2
|
+
Layout/LineLength:
|
5
3
|
Max: 150
|
6
4
|
|
7
5
|
Metrics/BlockLength:
|
8
6
|
Max: 50
|
7
|
+
|
8
|
+
Metrics/AbcSize:
|
9
|
+
Max: 30
|
10
|
+
|
11
|
+
Style/Documentation:
|
12
|
+
Enabled: false
|
13
|
+
Style/HashEachMethods:
|
14
|
+
Enabled: true
|
15
|
+
|
16
|
+
Style/HashTransformKeys:
|
17
|
+
Enabled: true
|
18
|
+
|
19
|
+
Style/HashTransformValues:
|
20
|
+
Enabled: true
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,51 +1,45 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
rtesseract (3.0.5)
|
5
|
-
nokogiri
|
4
|
+
rtesseract (3.1.0)
|
6
5
|
|
7
6
|
GEM
|
8
7
|
remote: https://rubygems.org/
|
9
8
|
specs:
|
10
|
-
coveralls (0.
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
9
|
+
coveralls (0.8.23)
|
10
|
+
json (>= 1.8, < 3)
|
11
|
+
simplecov (~> 0.16.1)
|
12
|
+
term-ansicolor (~> 1.3)
|
13
|
+
thor (>= 0.19.4, < 2.0)
|
14
|
+
tins (~> 1.6)
|
16
15
|
diff-lcs (1.3)
|
17
16
|
docile (1.3.2)
|
18
|
-
|
19
|
-
mime-types-data (~> 3.2015)
|
20
|
-
mime-types-data (3.2019.1009)
|
21
|
-
mini_portile2 (2.4.0)
|
22
|
-
multi_json (1.14.1)
|
23
|
-
nokogiri (1.10.9)
|
24
|
-
mini_portile2 (~> 2.4.0)
|
17
|
+
json (2.3.0)
|
25
18
|
rake (13.0.1)
|
26
|
-
rest-client (1.6.7)
|
27
|
-
mime-types (>= 1.16)
|
28
19
|
rspec (3.9.0)
|
29
20
|
rspec-core (~> 3.9.0)
|
30
21
|
rspec-expectations (~> 3.9.0)
|
31
22
|
rspec-mocks (~> 3.9.0)
|
32
23
|
rspec-core (3.9.1)
|
33
24
|
rspec-support (~> 3.9.1)
|
34
|
-
rspec-expectations (3.9.
|
25
|
+
rspec-expectations (3.9.1)
|
35
26
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
27
|
rspec-support (~> 3.9.0)
|
37
28
|
rspec-mocks (3.9.1)
|
38
29
|
diff-lcs (>= 1.2.0, < 2.0)
|
39
30
|
rspec-support (~> 3.9.0)
|
40
31
|
rspec-support (3.9.2)
|
41
|
-
simplecov (0.
|
32
|
+
simplecov (0.16.1)
|
42
33
|
docile (~> 1.1)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
json (>= 1.8, < 3)
|
35
|
+
simplecov-html (~> 0.10.0)
|
36
|
+
simplecov-html (0.10.2)
|
37
|
+
sync (0.5.0)
|
38
|
+
term-ansicolor (1.7.1)
|
39
|
+
tins (~> 1.0)
|
40
|
+
thor (1.0.1)
|
41
|
+
tins (1.24.1)
|
42
|
+
sync
|
49
43
|
|
50
44
|
PLATFORMS
|
51
45
|
ruby
|
@@ -56,7 +50,6 @@ DEPENDENCIES
|
|
56
50
|
rake
|
57
51
|
rspec
|
58
52
|
rtesseract!
|
59
|
-
simplecov
|
60
53
|
|
61
54
|
BUNDLED WITH
|
62
55
|
2.1.4
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
data/lib/rtesseract.rb
CHANGED
data/lib/rtesseract/base.rb
CHANGED
data/lib/rtesseract/box.rb
CHANGED
@@ -1,33 +1,45 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class RTesseract
|
4
4
|
module Box
|
5
5
|
extend RTesseract::Base
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
class << self
|
8
|
+
def run(source, errors, options)
|
9
|
+
options.tessedit_create_hocr = 1
|
9
10
|
|
10
|
-
|
11
|
+
RTesseract::Command.new(source, temp_file, errors, options).run
|
11
12
|
|
12
|
-
|
13
|
-
|
13
|
+
parse(File.read(temp_file('.hocr')))
|
14
|
+
end
|
14
15
|
|
15
|
-
|
16
|
-
|
17
|
-
html.css('span.ocrx_word, span.ocr_word').map do |word|
|
18
|
-
attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
|
19
|
-
word_info(word, attributes)
|
16
|
+
def parse(content)
|
17
|
+
content.lines.map { |line| parse_line(line) }.compact
|
20
18
|
end
|
21
|
-
end
|
22
19
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
20
|
+
def parse_line(line)
|
21
|
+
return unless line.match?(/oc(rx|r)_word/)
|
22
|
+
|
23
|
+
word = line.match(/(?<=>)(.*?)(?=<)/).to_s
|
24
|
+
|
25
|
+
return if word.strip == ''
|
26
|
+
|
27
|
+
word_info(word, parse_position(line))
|
28
|
+
end
|
29
|
+
|
30
|
+
def word_info(word, positions)
|
31
|
+
{
|
32
|
+
word: word,
|
33
|
+
x_start: positions[1].to_i,
|
34
|
+
y_start: positions[2].to_i,
|
35
|
+
x_end: positions[3].to_i,
|
36
|
+
y_end: positions[4].to_i
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def parse_position(line)
|
41
|
+
line.match(/(?<=title)(.*?)(?=;)/).to_s.split(' ')
|
42
|
+
end
|
31
43
|
end
|
32
44
|
end
|
33
45
|
end
|
data/lib/rtesseract/check.rb
CHANGED
data/lib/rtesseract/command.rb
CHANGED
data/lib/rtesseract/pdf.rb
CHANGED
data/lib/rtesseract/text.rb
CHANGED
data/lib/rtesseract/tsv.rb
CHANGED
data/lib/rtesseract/version.rb
CHANGED
data/rtesseract.gemspec
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'rtesseract/version'
|
@@ -8,9 +10,9 @@ Gem::Specification.new do |spec|
|
|
8
10
|
spec.authors = ['Danilo Jeremias da Silva']
|
9
11
|
spec.email = ['dannnylo@gmail.com']
|
10
12
|
|
11
|
-
spec.summary = 'Ruby library for working with the Tesseract OCR.'
|
12
|
-
spec.description = 'Ruby library for working with the Tesseract OCR.'
|
13
|
-
spec.homepage = 'http://github.com/dannnylo/rtesseract'
|
13
|
+
spec.summary = 'Ruby library for working with the Tesseract OCR.'
|
14
|
+
spec.description = 'Ruby library for working with the Tesseract OCR.'
|
15
|
+
spec.homepage = 'http://github.com/dannnylo/rtesseract'
|
14
16
|
spec.license = 'MIT'
|
15
17
|
|
16
18
|
# Specify which files should be added to the gem when it is released.
|
@@ -26,7 +28,4 @@ Gem::Specification.new do |spec|
|
|
26
28
|
spec.add_development_dependency 'coveralls'
|
27
29
|
spec.add_development_dependency 'rake'
|
28
30
|
spec.add_development_dependency 'rspec'
|
29
|
-
spec.add_development_dependency 'simplecov'
|
30
|
-
|
31
|
-
spec.add_dependency 'nokogiri'
|
32
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.5
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,34 +66,6 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: simplecov
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: nokogiri
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
69
|
description: Ruby library for working with the Tesseract OCR.
|
98
70
|
email:
|
99
71
|
- dannnylo@gmail.com
|
@@ -103,6 +75,7 @@ extra_rdoc_files: []
|
|
103
75
|
files:
|
104
76
|
- ".document"
|
105
77
|
- ".gitignore"
|
78
|
+
- ".hound.yml"
|
106
79
|
- ".rspec"
|
107
80
|
- ".rubocop.yml"
|
108
81
|
- ".travis.yml"
|