rtesseract 2.2.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +5 -5
  2. data/.document +1 -2
  3. data/.gitignore +12 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +13 -10
  6. data/CODE_OF_CONDUCT.md +74 -0
  7. data/Gemfile +4 -17
  8. data/Gemfile.lock +40 -85
  9. data/LICENSE.txt +18 -17
  10. data/README.md +137 -0
  11. data/Rakefile +4 -48
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/lib/rtesseract.rb +22 -220
  15. data/lib/rtesseract/box.rb +15 -60
  16. data/lib/rtesseract/check.rb +14 -0
  17. data/lib/rtesseract/command.rb +41 -0
  18. data/lib/rtesseract/configuration.rb +15 -64
  19. data/lib/rtesseract/pdf.rb +18 -0
  20. data/lib/rtesseract/text.rb +9 -0
  21. data/lib/rtesseract/tsv.rb +18 -0
  22. data/lib/rtesseract/version.rb +3 -0
  23. data/rtesseract.gemspec +27 -98
  24. metadata +36 -85
  25. data/README.rdoc +0 -156
  26. data/VERSION +0 -1
  27. data/lib/processors/mini_magick.rb +0 -43
  28. data/lib/processors/none.rb +0 -34
  29. data/lib/processors/rmagick.rb +0 -46
  30. data/lib/rtesseract/blob.rb +0 -34
  31. data/lib/rtesseract/box_char.rb +0 -31
  32. data/lib/rtesseract/errors.rb +0 -21
  33. data/lib/rtesseract/mixed.rb +0 -54
  34. data/lib/rtesseract/processor.rb +0 -19
  35. data/lib/rtesseract/utils.rb +0 -44
  36. data/lib/rtesseract/uzn.rb +0 -47
  37. data/spec/configs/eng.user-words.txt +0 -13
  38. data/spec/images/README.pdf +0 -0
  39. data/spec/images/blank.tif +0 -0
  40. data/spec/images/mixed.tif +0 -0
  41. data/spec/images/orientation_reverse.png +0 -0
  42. data/spec/images/test with spaces.tif +0 -0
  43. data/spec/images/test-pdf.png +0 -0
  44. data/spec/images/test.bmp +0 -0
  45. data/spec/images/test.jpg +0 -0
  46. data/spec/images/test.png +0 -0
  47. data/spec/images/test.tif +0 -0
  48. data/spec/images/test1.tif +0 -0
  49. data/spec/images/test_words.png +0 -0
  50. data/spec/rtesseract_box_char_spec.rb +0 -82
  51. data/spec/rtesseract_box_spec.rb +0 -36
  52. data/spec/rtesseract_mixed_spec.rb +0 -49
  53. data/spec/rtesseract_spec.rb +0 -282
  54. data/spec/rtesseract_uzn_spec.rb +0 -56
  55. data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,41 @@
1
+ require 'tmpdir'
2
+
3
+ class RTesseract
4
+ class Command
5
+ FIXED = [:command, :psm, :oem, :lang, :tessdata_dir, :user_words, :user_patterns, :config_file]
6
+
7
+ attr_reader :options
8
+
9
+ def initialize(source, output, options)
10
+ @source = source
11
+ @output = output
12
+ @options = options
13
+ end
14
+
15
+ def configs
16
+ @options.to_h.map { |key, value| ['-c', "#{key}=#{value}"] unless FIXED.include?(key) }.compact
17
+ end
18
+
19
+ def full_command
20
+ command = [options.command, @source, @output]
21
+
22
+ command << ['--psm', options.psm.to_s] if options.psm
23
+ command << ['--oem', options.oem.to_s] if options.oem
24
+ command << ['-l', options.lang] if options.lang
25
+
26
+ command << ['--tessdata_dir', options.tessdata_dir] if options.tessdata_dir
27
+ command << ['--user_words', options.user_words] if options.user_words
28
+ command << ['--user_patterns', options.user_patterns] if options.user_patterns
29
+
30
+ command << configs
31
+
32
+ command << options.config_file.to_s if options.config_file
33
+
34
+ command.flatten
35
+ end
36
+
37
+ def run
38
+ Open3.capture2e(*full_command)
39
+ end
40
+ end
41
+ end
@@ -1,75 +1,26 @@
1
- # RTesseract
2
- class RTesseract
3
- # Aliases to languages names
4
- LANGUAGES = {
5
- 'en' => 'eng',
6
- 'en-us' => 'eng',
7
- 'english' => 'eng',
8
- 'pt' => 'por',
9
- 'pt-br' => 'por',
10
- 'portuguese' => 'por',
11
- 'it' => 'ita',
12
- 'sp' => 'spa'
13
- }.freeze
14
-
15
- # Configuration class
16
- class Configuration
17
- attr_accessor :processor, :lang, :psm, :oem, :tessdata_dir, :user_words, :user_patterns, :command, :debug, :options_cmd
18
-
19
- def initialize
20
- @processor = 'rmagick'
21
- end
22
-
23
- # Global configuration
24
- def parent
25
- @parent ||= RTesseract.configuration || RTesseract::Configuration.new
26
- end
27
-
28
- # Set value of option
29
- def option(options, name, default = nil)
30
- self.instance_variable_set("@#{name}", options.option(name, parent.send(name)) || default)
31
- end
1
+ require 'ostruct'
32
2
 
33
- # Return the values of options
34
- def load_options(options, names = [])
35
- names.each { |name| option(options, name, nil) }
3
+ class RTesseract
4
+ class Configuration < OpenStruct
5
+ def merge(options)
6
+ RTesseract::Configuration.new(self.to_h.merge(options))
36
7
  end
37
8
  end
38
9
 
39
10
  class << self
40
- attr_accessor :configuration
41
- end
42
-
43
- def self.configure
44
- self.configuration ||= Configuration.new
45
- yield(configuration)
46
- self.clear_pdf_option
47
- end
48
-
49
- # Clear pdf option
50
- def self.clear_pdf_option
51
- if self.configuration.options_cmd
52
- self.configuration.options_cmd.delete('pdf')
53
- self.configuration.options_cmd.delete(:pdf)
11
+ def config
12
+ @config ||= RTesseract::Configuration.new(
13
+ command: 'tesseract',
14
+ debug_file: '/dev/null'
15
+ )
54
16
  end
55
- end
56
17
 
57
- # Default command
58
- def self.default_command
59
- TesseractBin::Executables[:tesseract] || 'tesseract'
60
- rescue
61
- 'tesseract'
62
- end
18
+ def configure
19
+ yield(config) if block_given?
20
+ end
63
21
 
64
- # Local config to instance
65
- def self.local_config(options = {})
66
- RTesseract::Configuration.new.tap do |config|
67
- config.command = config.option(options, :command, RTesseract.default_command)
68
- config.processor = config.option(options, :processor, 'rmagick')
69
- config.load_options(options, [:lang, :psm, :oem, :tessdata_dir, :user_words, :user_patterns])
70
- config.debug = config.option(options, :debug, false)
71
- pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
72
- config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
22
+ def reset_config!
23
+ @config = nil
73
24
  end
74
25
  end
75
26
  end
@@ -0,0 +1,18 @@
1
+ require 'tmpdir'
2
+
3
+ class RTesseract
4
+ module Pdf
5
+ def self.temp_dir
6
+ @file_path = Pathname.new(Dir.tmpdir)
7
+ end
8
+
9
+ def self.run(source, options)
10
+ name = "rtesseract_#{SecureRandom.uuid}"
11
+ options.tessedit_create_pdf = 1
12
+
13
+ RTesseract::Command.new(source, temp_dir.join(name).to_s, options).run
14
+
15
+ File.open(temp_dir.join("#{name}.pdf").to_s, 'r')
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,9 @@
1
+ require 'open3'
2
+
3
+ class RTesseract
4
+ module Text
5
+ def self.run(source, options)
6
+ RTesseract::Command.new(source, 'stdout', options).run.first
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,18 @@
1
+ require 'tmpdir'
2
+
3
+ class RTesseract
4
+ module Tsv
5
+ def self.temp_dir
6
+ @file_path = Pathname.new(Dir.tmpdir)
7
+ end
8
+
9
+ def self.run(source, options)
10
+ name = "rtesseract_#{SecureRandom.uuid}"
11
+ options.tessedit_create_tsv = 1
12
+
13
+ RTesseract::Command.new(source, temp_dir.join(name).to_s, options).run
14
+
15
+ File.open(temp_dir.join("#{name}.tsv").to_s, 'r')
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,3 @@
1
+ class RTesseract
2
+ VERSION = '3.0.0'.freeze
3
+ end
data/rtesseract.gemspec CHANGED
@@ -1,104 +1,33 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
- # -*- encoding: utf-8 -*-
5
- # stub: rtesseract 2.2.0 ruby lib
6
1
 
7
- Gem::Specification.new do |s|
8
- s.name = "rtesseract".freeze
9
- s.version = "2.2.0"
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "rtesseract/version"
10
5
 
11
- s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
- s.require_paths = ["lib".freeze]
13
- s.authors = ["Danilo Jeremias da Silva".freeze]
14
- s.date = "2018-01-05"
15
- s.description = "Ruby library for working with the Tesseract OCR.".freeze
16
- s.email = "dannnylo@gmail.com".freeze
17
- s.extra_rdoc_files = [
18
- "LICENSE.txt",
19
- "README.rdoc"
20
- ]
21
- s.files = [
22
- ".document",
23
- ".rspec",
24
- ".travis.yml",
25
- "CHANGELOG.md",
26
- "Gemfile",
27
- "Gemfile.lock",
28
- "LICENSE.txt",
29
- "README.rdoc",
30
- "Rakefile",
31
- "VERSION",
32
- "lib/processors/mini_magick.rb",
33
- "lib/processors/none.rb",
34
- "lib/processors/rmagick.rb",
35
- "lib/rtesseract.rb",
36
- "lib/rtesseract/blob.rb",
37
- "lib/rtesseract/box.rb",
38
- "lib/rtesseract/box_char.rb",
39
- "lib/rtesseract/configuration.rb",
40
- "lib/rtesseract/errors.rb",
41
- "lib/rtesseract/mixed.rb",
42
- "lib/rtesseract/processor.rb",
43
- "lib/rtesseract/utils.rb",
44
- "lib/rtesseract/uzn.rb",
45
- "rtesseract.gemspec",
46
- "spec/configs/eng.user-words.txt",
47
- "spec/images/README.pdf",
48
- "spec/images/blank.tif",
49
- "spec/images/mixed.tif",
50
- "spec/images/orientation_reverse.png",
51
- "spec/images/test with spaces.tif",
52
- "spec/images/test-pdf.png",
53
- "spec/images/test.bmp",
54
- "spec/images/test.jpg",
55
- "spec/images/test.png",
56
- "spec/images/test.tif",
57
- "spec/images/test1.tif",
58
- "spec/images/test_words.png",
59
- "spec/rtesseract_box_char_spec.rb",
60
- "spec/rtesseract_box_spec.rb",
61
- "spec/rtesseract_mixed_spec.rb",
62
- "spec/rtesseract_spec.rb",
63
- "spec/rtesseract_uzn_spec.rb",
64
- "spec/spec_helper.rb"
65
- ]
66
- s.homepage = "http://github.com/dannnylo/rtesseract".freeze
67
- s.licenses = ["MIT".freeze]
68
- s.rubygems_version = "2.6.14".freeze
69
- s.summary = "Ruby library for working with the Tesseract OCR.".freeze
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "rtesseract"
8
+ spec.version = RTesseract::VERSION
9
+ spec.authors = ["Danilo Jeremias da Silva"]
10
+ spec.email = ["dannnylo@gmail.com"]
70
11
 
71
- if s.respond_to? :specification_version then
72
- s.specification_version = 4
12
+ spec.summary = "Ruby library for working with the Tesseract OCR.".freeze
13
+ spec.description = "Ruby library for working with the Tesseract OCR.".freeze
14
+ spec.homepage = "http://github.com/dannnylo/rtesseract".freeze
15
+ spec.license = "MIT"
73
16
 
74
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
75
- s.add_runtime_dependency(%q<nokogiri>.freeze, [">= 0"])
76
- s.add_development_dependency(%q<rspec>.freeze, [">= 0"])
77
- s.add_development_dependency(%q<rdoc>.freeze, [">= 0"])
78
- s.add_development_dependency(%q<bundler>.freeze, [">= 0"])
79
- s.add_development_dependency(%q<jeweler>.freeze, [">= 0"])
80
- s.add_development_dependency(%q<simplecov>.freeze, [">= 0"])
81
- s.add_development_dependency(%q<json>.freeze, [">= 0"])
82
- s.add_development_dependency(%q<coveralls>.freeze, [">= 0"])
83
- else
84
- s.add_dependency(%q<nokogiri>.freeze, [">= 0"])
85
- s.add_dependency(%q<rspec>.freeze, [">= 0"])
86
- s.add_dependency(%q<rdoc>.freeze, [">= 0"])
87
- s.add_dependency(%q<bundler>.freeze, [">= 0"])
88
- s.add_dependency(%q<jeweler>.freeze, [">= 0"])
89
- s.add_dependency(%q<simplecov>.freeze, [">= 0"])
90
- s.add_dependency(%q<json>.freeze, [">= 0"])
91
- s.add_dependency(%q<coveralls>.freeze, [">= 0"])
92
- end
93
- else
94
- s.add_dependency(%q<nokogiri>.freeze, [">= 0"])
95
- s.add_dependency(%q<rspec>.freeze, [">= 0"])
96
- s.add_dependency(%q<rdoc>.freeze, [">= 0"])
97
- s.add_dependency(%q<bundler>.freeze, [">= 0"])
98
- s.add_dependency(%q<jeweler>.freeze, [">= 0"])
99
- s.add_dependency(%q<simplecov>.freeze, [">= 0"])
100
- s.add_dependency(%q<json>.freeze, [">= 0"])
101
- s.add_dependency(%q<coveralls>.freeze, [">= 0"])
17
+ # Specify which files should be added to the gem when it is released.
18
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
19
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
20
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
102
21
  end
103
- end
22
+ spec.bindir = "exe"
23
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.require_paths = ["lib"]
25
+
26
+ spec.add_development_dependency "bundler", "~> 1.17"
27
+ spec.add_development_dependency "rake", "~> 10.0"
28
+ spec.add_development_dependency "rspec", "~> 3.0"
29
+ spec.add_development_dependency "simplecov"
30
+ spec.add_development_dependency "coveralls"
104
31
 
32
+ spec.add_dependency "nokogiri"
33
+ end
metadata CHANGED
@@ -1,85 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
- date: 2018-01-05 00:00:00.000000000 Z
11
+ date: 2019-01-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rspec
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rdoc
14
+ name: bundler
43
15
  requirement: !ruby/object:Gem::Requirement
44
16
  requirements:
45
- - - ">="
17
+ - - "~>"
46
18
  - !ruby/object:Gem::Version
47
- version: '0'
19
+ version: '1.17'
48
20
  type: :development
49
21
  prerelease: false
50
22
  version_requirements: !ruby/object:Gem::Requirement
51
23
  requirements:
52
- - - ">="
24
+ - - "~>"
53
25
  - !ruby/object:Gem::Version
54
- version: '0'
26
+ version: '1.17'
55
27
  - !ruby/object:Gem::Dependency
56
- name: bundler
28
+ name: rake
57
29
  requirement: !ruby/object:Gem::Requirement
58
30
  requirements:
59
- - - ">="
31
+ - - "~>"
60
32
  - !ruby/object:Gem::Version
61
- version: '0'
33
+ version: '10.0'
62
34
  type: :development
63
35
  prerelease: false
64
36
  version_requirements: !ruby/object:Gem::Requirement
65
37
  requirements:
66
- - - ">="
38
+ - - "~>"
67
39
  - !ruby/object:Gem::Version
68
- version: '0'
40
+ version: '10.0'
69
41
  - !ruby/object:Gem::Dependency
70
- name: jeweler
42
+ name: rspec
71
43
  requirement: !ruby/object:Gem::Requirement
72
44
  requirements:
73
- - - ">="
45
+ - - "~>"
74
46
  - !ruby/object:Gem::Version
75
- version: '0'
47
+ version: '3.0'
76
48
  type: :development
77
49
  prerelease: false
78
50
  version_requirements: !ruby/object:Gem::Requirement
79
51
  requirements:
80
- - - ">="
52
+ - - "~>"
81
53
  - !ruby/object:Gem::Version
82
- version: '0'
54
+ version: '3.0'
83
55
  - !ruby/object:Gem::Dependency
84
56
  name: simplecov
85
57
  requirement: !ruby/object:Gem::Requirement
@@ -95,7 +67,7 @@ dependencies:
95
67
  - !ruby/object:Gem::Version
96
68
  version: '0'
97
69
  - !ruby/object:Gem::Dependency
98
- name: json
70
+ name: coveralls
99
71
  requirement: !ruby/object:Gem::Requirement
100
72
  requirements:
101
73
  - - ">="
@@ -109,13 +81,13 @@ dependencies:
109
81
  - !ruby/object:Gem::Version
110
82
  version: '0'
111
83
  - !ruby/object:Gem::Dependency
112
- name: coveralls
84
+ name: nokogiri
113
85
  requirement: !ruby/object:Gem::Requirement
114
86
  requirements:
115
87
  - - ">="
116
88
  - !ruby/object:Gem::Version
117
89
  version: '0'
118
- type: :development
90
+ type: :runtime
119
91
  prerelease: false
120
92
  version_requirements: !ruby/object:Gem::Requirement
121
93
  requirements:
@@ -123,56 +95,35 @@ dependencies:
123
95
  - !ruby/object:Gem::Version
124
96
  version: '0'
125
97
  description: Ruby library for working with the Tesseract OCR.
126
- email: dannnylo@gmail.com
98
+ email:
99
+ - dannnylo@gmail.com
127
100
  executables: []
128
101
  extensions: []
129
- extra_rdoc_files:
130
- - LICENSE.txt
131
- - README.rdoc
102
+ extra_rdoc_files: []
132
103
  files:
133
104
  - ".document"
105
+ - ".gitignore"
134
106
  - ".rspec"
135
107
  - ".travis.yml"
136
108
  - CHANGELOG.md
109
+ - CODE_OF_CONDUCT.md
137
110
  - Gemfile
138
111
  - Gemfile.lock
139
112
  - LICENSE.txt
140
- - README.rdoc
113
+ - README.md
141
114
  - Rakefile
142
- - VERSION
143
- - lib/processors/mini_magick.rb
144
- - lib/processors/none.rb
145
- - lib/processors/rmagick.rb
115
+ - bin/console
116
+ - bin/setup
146
117
  - lib/rtesseract.rb
147
- - lib/rtesseract/blob.rb
148
118
  - lib/rtesseract/box.rb
149
- - lib/rtesseract/box_char.rb
119
+ - lib/rtesseract/check.rb
120
+ - lib/rtesseract/command.rb
150
121
  - lib/rtesseract/configuration.rb
151
- - lib/rtesseract/errors.rb
152
- - lib/rtesseract/mixed.rb
153
- - lib/rtesseract/processor.rb
154
- - lib/rtesseract/utils.rb
155
- - lib/rtesseract/uzn.rb
122
+ - lib/rtesseract/pdf.rb
123
+ - lib/rtesseract/text.rb
124
+ - lib/rtesseract/tsv.rb
125
+ - lib/rtesseract/version.rb
156
126
  - rtesseract.gemspec
157
- - spec/configs/eng.user-words.txt
158
- - spec/images/README.pdf
159
- - spec/images/blank.tif
160
- - spec/images/mixed.tif
161
- - spec/images/orientation_reverse.png
162
- - spec/images/test with spaces.tif
163
- - spec/images/test-pdf.png
164
- - spec/images/test.bmp
165
- - spec/images/test.jpg
166
- - spec/images/test.png
167
- - spec/images/test.tif
168
- - spec/images/test1.tif
169
- - spec/images/test_words.png
170
- - spec/rtesseract_box_char_spec.rb
171
- - spec/rtesseract_box_spec.rb
172
- - spec/rtesseract_mixed_spec.rb
173
- - spec/rtesseract_spec.rb
174
- - spec/rtesseract_uzn_spec.rb
175
- - spec/spec_helper.rb
176
127
  homepage: http://github.com/dannnylo/rtesseract
177
128
  licenses:
178
129
  - MIT
@@ -193,7 +144,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
144
  version: '0'
194
145
  requirements: []
195
146
  rubyforge_project:
196
- rubygems_version: 2.6.14
147
+ rubygems_version: 2.7.6
197
148
  signing_key:
198
149
  specification_version: 4
199
150
  summary: Ruby library for working with the Tesseract OCR.