rtesseract 3.0.5 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4b1d915346574fe70ae10644ceda44474731107f8876bef1f01675355c379516
4
- data.tar.gz: 7cfdf5c6838de59afe1e2296b2e08dbfe59c09b93aa16145241e6dc38da845a8
3
+ metadata.gz: 43dd931b6649c05a5cb484f3fc0548075d088e5682bbbea46d7ac3202a9b607c
4
+ data.tar.gz: 1b300fa5a83f458c986dabde158b3ff31dfeed6e7c7c3c98a17812fdf62c4bbb
5
5
  SHA512:
6
- metadata.gz: 247e5d2c0ecb1e77931da2a494fc3410efbb65dc6a920bc63c07b8498bebd846443c4c0cc0d3dc3027212a51a7e8a0ffbd715bb4edb438c69e38aa8be69cbfd6
7
- data.tar.gz: cc172d40a30ffbdc0aa64a8193f04e73c508ca47b4146f28b9d41f0663be0024aed70827045e502140eab0938a09177930310a66872d4fa9edfd09e7bae16d20
6
+ metadata.gz: 955b9c159f1a89d681e5add1151d79ad2f0f29fbcd65d5558cb08b4a856f5442b6dfd4ae157b57f5bad869dbf9690a13bd1784bb23a7068a7c8402c8abba2468
7
+ data.tar.gz: daa3a7aa5ba7549c30f0bb85019c6197acec22854122887a8fd5693b52828f8c1b1210806c6d307f1109ddb784f13f090192884f8ae820941b444055df79de7e
@@ -0,0 +1,3 @@
1
+ rubocop:
2
+ config_file: .rubocop.yml
3
+ version: 0.80.0
@@ -1,8 +1,20 @@
1
- Documentation:
2
- Enabled: false
3
1
 
4
- Metrics/LineLength:
2
+ Layout/LineLength:
5
3
  Max: 150
6
4
 
7
5
  Metrics/BlockLength:
8
6
  Max: 50
7
+
8
+ Metrics/AbcSize:
9
+ Max: 30
10
+
11
+ Style/Documentation:
12
+ Enabled: false
13
+ Style/HashEachMethods:
14
+ Enabled: true
15
+
16
+ Style/HashTransformKeys:
17
+ Enabled: true
18
+
19
+ Style/HashTransformValues:
20
+ Enabled: true
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
@@ -1,51 +1,45 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- rtesseract (3.0.5)
5
- nokogiri
4
+ rtesseract (3.1.0)
6
5
 
7
6
  GEM
8
7
  remote: https://rubygems.org/
9
8
  specs:
10
- coveralls (0.7.2)
11
- multi_json (~> 1.3)
12
- rest-client (= 1.6.7)
13
- simplecov (>= 0.7)
14
- term-ansicolor (= 1.2.2)
15
- thor (= 0.18.1)
9
+ coveralls (0.8.23)
10
+ json (>= 1.8, < 3)
11
+ simplecov (~> 0.16.1)
12
+ term-ansicolor (~> 1.3)
13
+ thor (>= 0.19.4, < 2.0)
14
+ tins (~> 1.6)
16
15
  diff-lcs (1.3)
17
16
  docile (1.3.2)
18
- mime-types (3.3.1)
19
- mime-types-data (~> 3.2015)
20
- mime-types-data (3.2019.1009)
21
- mini_portile2 (2.4.0)
22
- multi_json (1.14.1)
23
- nokogiri (1.10.9)
24
- mini_portile2 (~> 2.4.0)
17
+ json (2.3.0)
25
18
  rake (13.0.1)
26
- rest-client (1.6.7)
27
- mime-types (>= 1.16)
28
19
  rspec (3.9.0)
29
20
  rspec-core (~> 3.9.0)
30
21
  rspec-expectations (~> 3.9.0)
31
22
  rspec-mocks (~> 3.9.0)
32
23
  rspec-core (3.9.1)
33
24
  rspec-support (~> 3.9.1)
34
- rspec-expectations (3.9.0)
25
+ rspec-expectations (3.9.1)
35
26
  diff-lcs (>= 1.2.0, < 2.0)
36
27
  rspec-support (~> 3.9.0)
37
28
  rspec-mocks (3.9.1)
38
29
  diff-lcs (>= 1.2.0, < 2.0)
39
30
  rspec-support (~> 3.9.0)
40
31
  rspec-support (3.9.2)
41
- simplecov (0.18.5)
32
+ simplecov (0.16.1)
42
33
  docile (~> 1.1)
43
- simplecov-html (~> 0.11)
44
- simplecov-html (0.12.2)
45
- term-ansicolor (1.2.2)
46
- tins (~> 0.8)
47
- thor (0.18.1)
48
- tins (0.13.2)
34
+ json (>= 1.8, < 3)
35
+ simplecov-html (~> 0.10.0)
36
+ simplecov-html (0.10.2)
37
+ sync (0.5.0)
38
+ term-ansicolor (1.7.1)
39
+ tins (~> 1.0)
40
+ thor (1.0.1)
41
+ tins (1.24.1)
42
+ sync
49
43
 
50
44
  PLATFORMS
51
45
  ruby
@@ -56,7 +50,6 @@ DEPENDENCIES
56
50
  rake
57
51
  rspec
58
52
  rtesseract!
59
- simplecov
60
53
 
61
54
  BUNDLED WITH
62
55
  2.1.4
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/setup'
4
5
  require 'rtesseract'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rtesseract/check'
2
4
  require 'rtesseract/configuration'
3
5
  require 'rtesseract/command'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'tmpdir'
2
4
  require 'securerandom'
3
5
  require 'pathname'
@@ -1,33 +1,45 @@
1
- require 'nokogiri'
1
+ # frozen_string_literal: true
2
2
 
3
3
  class RTesseract
4
4
  module Box
5
5
  extend RTesseract::Base
6
6
 
7
- def self.run(source, errors, options)
8
- options.tessedit_create_hocr = 1
7
+ class << self
8
+ def run(source, errors, options)
9
+ options.tessedit_create_hocr = 1
9
10
 
10
- RTesseract::Command.new(source, temp_file, errors, options).run
11
+ RTesseract::Command.new(source, temp_file, errors, options).run
11
12
 
12
- parse(File.read(temp_file('.hocr')))
13
- end
13
+ parse(File.read(temp_file('.hocr')))
14
+ end
14
15
 
15
- def self.parse(content)
16
- html = Nokogiri::HTML(content)
17
- html.css('span.ocrx_word, span.ocr_word').map do |word|
18
- attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
19
- word_info(word, attributes)
16
+ def parse(content)
17
+ content.lines.map { |line| parse_line(line) }.compact
20
18
  end
21
- end
22
19
 
23
- def self.word_info(word, data)
24
- {
25
- word: word.text,
26
- x_start: data[1].to_i,
27
- y_start: data[2].to_i,
28
- x_end: data[3].to_i,
29
- y_end: data[4].to_i
30
- }
20
+ def parse_line(line)
21
+ return unless line.match?(/oc(rx|r)_word/)
22
+
23
+ word = line.match(/(?<=>)(.*?)(?=<)/).to_s
24
+
25
+ return if word.strip == ''
26
+
27
+ word_info(word, parse_position(line))
28
+ end
29
+
30
+ def word_info(word, positions)
31
+ {
32
+ word: word,
33
+ x_start: positions[1].to_i,
34
+ y_start: positions[2].to_i,
35
+ x_end: positions[3].to_i,
36
+ y_end: positions[4].to_i
37
+ }
38
+ end
39
+
40
+ def parse_position(line)
41
+ line.match(/(?<=title)(.*?)(?=;)/).to_s.split(' ')
42
+ end
31
43
  end
32
44
  end
33
45
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  class << self
3
5
  def tesseract_version
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  class Command
3
5
  FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'ostruct'
2
4
 
3
5
  class RTesseract
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  module Pdf
3
5
  extend Base
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'open3'
2
4
 
3
5
  class RTesseract
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  module Tsv
3
5
  extend Base
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
- VERSION = '3.0.5'.freeze
4
+ VERSION = '3.1.0'
3
5
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'rtesseract/version'
@@ -8,9 +10,9 @@ Gem::Specification.new do |spec|
8
10
  spec.authors = ['Danilo Jeremias da Silva']
9
11
  spec.email = ['dannnylo@gmail.com']
10
12
 
11
- spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
12
- spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
13
- spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
13
+ spec.summary = 'Ruby library for working with the Tesseract OCR.'
14
+ spec.description = 'Ruby library for working with the Tesseract OCR.'
15
+ spec.homepage = 'http://github.com/dannnylo/rtesseract'
14
16
  spec.license = 'MIT'
15
17
 
16
18
  # Specify which files should be added to the gem when it is released.
@@ -26,7 +28,4 @@ Gem::Specification.new do |spec|
26
28
  spec.add_development_dependency 'coveralls'
27
29
  spec.add_development_dependency 'rake'
28
30
  spec.add_development_dependency 'rspec'
29
- spec.add_development_dependency 'simplecov'
30
-
31
- spec.add_dependency 'nokogiri'
32
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.5
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-18 00:00:00.000000000 Z
11
+ date: 2020-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,34 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: simplecov
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: nokogiri
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :runtime
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
69
  description: Ruby library for working with the Tesseract OCR.
98
70
  email:
99
71
  - dannnylo@gmail.com
@@ -103,6 +75,7 @@ extra_rdoc_files: []
103
75
  files:
104
76
  - ".document"
105
77
  - ".gitignore"
78
+ - ".hound.yml"
106
79
  - ".rspec"
107
80
  - ".rubocop.yml"
108
81
  - ".travis.yml"