rtesseract 3.0.5 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4b1d915346574fe70ae10644ceda44474731107f8876bef1f01675355c379516
4
- data.tar.gz: 7cfdf5c6838de59afe1e2296b2e08dbfe59c09b93aa16145241e6dc38da845a8
3
+ metadata.gz: 43dd931b6649c05a5cb484f3fc0548075d088e5682bbbea46d7ac3202a9b607c
4
+ data.tar.gz: 1b300fa5a83f458c986dabde158b3ff31dfeed6e7c7c3c98a17812fdf62c4bbb
5
5
  SHA512:
6
- metadata.gz: 247e5d2c0ecb1e77931da2a494fc3410efbb65dc6a920bc63c07b8498bebd846443c4c0cc0d3dc3027212a51a7e8a0ffbd715bb4edb438c69e38aa8be69cbfd6
7
- data.tar.gz: cc172d40a30ffbdc0aa64a8193f04e73c508ca47b4146f28b9d41f0663be0024aed70827045e502140eab0938a09177930310a66872d4fa9edfd09e7bae16d20
6
+ metadata.gz: 955b9c159f1a89d681e5add1151d79ad2f0f29fbcd65d5558cb08b4a856f5442b6dfd4ae157b57f5bad869dbf9690a13bd1784bb23a7068a7c8402c8abba2468
7
+ data.tar.gz: daa3a7aa5ba7549c30f0bb85019c6197acec22854122887a8fd5693b52828f8c1b1210806c6d307f1109ddb784f13f090192884f8ae820941b444055df79de7e
@@ -0,0 +1,3 @@
1
+ rubocop:
2
+ config_file: .rubocop.yml
3
+ version: 0.80.0
@@ -1,8 +1,20 @@
1
- Documentation:
2
- Enabled: false
3
1
 
4
- Metrics/LineLength:
2
+ Layout/LineLength:
5
3
  Max: 150
6
4
 
7
5
  Metrics/BlockLength:
8
6
  Max: 50
7
+
8
+ Metrics/AbcSize:
9
+ Max: 30
10
+
11
+ Style/Documentation:
12
+ Enabled: false
13
+ Style/HashEachMethods:
14
+ Enabled: true
15
+
16
+ Style/HashTransformKeys:
17
+ Enabled: true
18
+
19
+ Style/HashTransformValues:
20
+ Enabled: true
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
@@ -1,51 +1,45 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- rtesseract (3.0.5)
5
- nokogiri
4
+ rtesseract (3.1.0)
6
5
 
7
6
  GEM
8
7
  remote: https://rubygems.org/
9
8
  specs:
10
- coveralls (0.7.2)
11
- multi_json (~> 1.3)
12
- rest-client (= 1.6.7)
13
- simplecov (>= 0.7)
14
- term-ansicolor (= 1.2.2)
15
- thor (= 0.18.1)
9
+ coveralls (0.8.23)
10
+ json (>= 1.8, < 3)
11
+ simplecov (~> 0.16.1)
12
+ term-ansicolor (~> 1.3)
13
+ thor (>= 0.19.4, < 2.0)
14
+ tins (~> 1.6)
16
15
  diff-lcs (1.3)
17
16
  docile (1.3.2)
18
- mime-types (3.3.1)
19
- mime-types-data (~> 3.2015)
20
- mime-types-data (3.2019.1009)
21
- mini_portile2 (2.4.0)
22
- multi_json (1.14.1)
23
- nokogiri (1.10.9)
24
- mini_portile2 (~> 2.4.0)
17
+ json (2.3.0)
25
18
  rake (13.0.1)
26
- rest-client (1.6.7)
27
- mime-types (>= 1.16)
28
19
  rspec (3.9.0)
29
20
  rspec-core (~> 3.9.0)
30
21
  rspec-expectations (~> 3.9.0)
31
22
  rspec-mocks (~> 3.9.0)
32
23
  rspec-core (3.9.1)
33
24
  rspec-support (~> 3.9.1)
34
- rspec-expectations (3.9.0)
25
+ rspec-expectations (3.9.1)
35
26
  diff-lcs (>= 1.2.0, < 2.0)
36
27
  rspec-support (~> 3.9.0)
37
28
  rspec-mocks (3.9.1)
38
29
  diff-lcs (>= 1.2.0, < 2.0)
39
30
  rspec-support (~> 3.9.0)
40
31
  rspec-support (3.9.2)
41
- simplecov (0.18.5)
32
+ simplecov (0.16.1)
42
33
  docile (~> 1.1)
43
- simplecov-html (~> 0.11)
44
- simplecov-html (0.12.2)
45
- term-ansicolor (1.2.2)
46
- tins (~> 0.8)
47
- thor (0.18.1)
48
- tins (0.13.2)
34
+ json (>= 1.8, < 3)
35
+ simplecov-html (~> 0.10.0)
36
+ simplecov-html (0.10.2)
37
+ sync (0.5.0)
38
+ term-ansicolor (1.7.1)
39
+ tins (~> 1.0)
40
+ thor (1.0.1)
41
+ tins (1.24.1)
42
+ sync
49
43
 
50
44
  PLATFORMS
51
45
  ruby
@@ -56,7 +50,6 @@ DEPENDENCIES
56
50
  rake
57
51
  rspec
58
52
  rtesseract!
59
- simplecov
60
53
 
61
54
  BUNDLED WITH
62
55
  2.1.4
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/setup'
4
5
  require 'rtesseract'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rtesseract/check'
2
4
  require 'rtesseract/configuration'
3
5
  require 'rtesseract/command'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'tmpdir'
2
4
  require 'securerandom'
3
5
  require 'pathname'
@@ -1,33 +1,45 @@
1
- require 'nokogiri'
1
+ # frozen_string_literal: true
2
2
 
3
3
  class RTesseract
4
4
  module Box
5
5
  extend RTesseract::Base
6
6
 
7
- def self.run(source, errors, options)
8
- options.tessedit_create_hocr = 1
7
+ class << self
8
+ def run(source, errors, options)
9
+ options.tessedit_create_hocr = 1
9
10
 
10
- RTesseract::Command.new(source, temp_file, errors, options).run
11
+ RTesseract::Command.new(source, temp_file, errors, options).run
11
12
 
12
- parse(File.read(temp_file('.hocr')))
13
- end
13
+ parse(File.read(temp_file('.hocr')))
14
+ end
14
15
 
15
- def self.parse(content)
16
- html = Nokogiri::HTML(content)
17
- html.css('span.ocrx_word, span.ocr_word').map do |word|
18
- attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
19
- word_info(word, attributes)
16
+ def parse(content)
17
+ content.lines.map { |line| parse_line(line) }.compact
20
18
  end
21
- end
22
19
 
23
- def self.word_info(word, data)
24
- {
25
- word: word.text,
26
- x_start: data[1].to_i,
27
- y_start: data[2].to_i,
28
- x_end: data[3].to_i,
29
- y_end: data[4].to_i
30
- }
20
+ def parse_line(line)
21
+ return unless line.match?(/oc(rx|r)_word/)
22
+
23
+ word = line.match(/(?<=>)(.*?)(?=<)/).to_s
24
+
25
+ return if word.strip == ''
26
+
27
+ word_info(word, parse_position(line))
28
+ end
29
+
30
+ def word_info(word, positions)
31
+ {
32
+ word: word,
33
+ x_start: positions[1].to_i,
34
+ y_start: positions[2].to_i,
35
+ x_end: positions[3].to_i,
36
+ y_end: positions[4].to_i
37
+ }
38
+ end
39
+
40
+ def parse_position(line)
41
+ line.match(/(?<=title)(.*?)(?=;)/).to_s.split(' ')
42
+ end
31
43
  end
32
44
  end
33
45
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  class << self
3
5
  def tesseract_version
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  class Command
3
5
  FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'ostruct'
2
4
 
3
5
  class RTesseract
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  module Pdf
3
5
  extend Base
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'open3'
2
4
 
3
5
  class RTesseract
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
4
  module Tsv
3
5
  extend Base
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class RTesseract
2
- VERSION = '3.0.5'.freeze
4
+ VERSION = '3.1.0'
3
5
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'rtesseract/version'
@@ -8,9 +10,9 @@ Gem::Specification.new do |spec|
8
10
  spec.authors = ['Danilo Jeremias da Silva']
9
11
  spec.email = ['dannnylo@gmail.com']
10
12
 
11
- spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
12
- spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
13
- spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
13
+ spec.summary = 'Ruby library for working with the Tesseract OCR.'
14
+ spec.description = 'Ruby library for working with the Tesseract OCR.'
15
+ spec.homepage = 'http://github.com/dannnylo/rtesseract'
14
16
  spec.license = 'MIT'
15
17
 
16
18
  # Specify which files should be added to the gem when it is released.
@@ -26,7 +28,4 @@ Gem::Specification.new do |spec|
26
28
  spec.add_development_dependency 'coveralls'
27
29
  spec.add_development_dependency 'rake'
28
30
  spec.add_development_dependency 'rspec'
29
- spec.add_development_dependency 'simplecov'
30
-
31
- spec.add_dependency 'nokogiri'
32
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.5
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-18 00:00:00.000000000 Z
11
+ date: 2020-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,34 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: simplecov
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: nokogiri
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :runtime
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
69
  description: Ruby library for working with the Tesseract OCR.
98
70
  email:
99
71
  - dannnylo@gmail.com
@@ -103,6 +75,7 @@ extra_rdoc_files: []
103
75
  files:
104
76
  - ".document"
105
77
  - ".gitignore"
78
+ - ".hound.yml"
106
79
  - ".rspec"
107
80
  - ".rubocop.yml"
108
81
  - ".travis.yml"