bagira 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c89e0a11106f559851a61973771b5470c7311fd8
4
+ data.tar.gz: d24ecb98d615bd5dc485cd14120edb4553151f32
5
+ SHA512:
6
+ metadata.gz: 8b72a911a0d94c4618a4996f29e69d8bbc2910199b595fdfc5bc8bc216a7e3bd60c43835152a50319e92e903c7022769c9d616609f8fba409f2e6326cf96ec90
7
+ data.tar.gz: 6fe3df94d34f740e69b896785bdf1bc1cc33254ff2cee451f5523947a6a4b7f57f2f57d13f241038959c971b2f191f060b5a5adeab929323fadd7d325ff8b255
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in bagira.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Juan Manuel Vallejo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,54 @@
1
+ # Bagira
2
+
3
+ This is a simple gem that runs Ghostscript and Tesseract OCR from the command line to perform OCR on images and PDF documents. It currently supports JPG, PNG and PDF files.
4
+
5
+ ## Installation
6
+
7
+ Before using this gem, please make sure that you have installed both Ghostscript and Tesseract OCR.
8
+
9
+ On Ubuntu:
10
+ ```
11
+ sudo apt-get install libgs-dev
12
+ sudo apt-get install tesseract-ocr
13
+ ```
14
+
15
+ On Mac (Homebrew):
16
+ ```
17
+ brew install gs
18
+ brew install tesseract
19
+ ```
20
+
21
+ Add this line to your application's Gemfile:
22
+
23
+ ```ruby
24
+ gem 'bagira'
25
+ ```
26
+
27
+ And then execute:
28
+
29
+ $ bundle
30
+
31
+ Or install it yourself as:
32
+
33
+ $ gem install bagira
34
+
35
+ ## Usage
36
+
37
+ ```ruby
38
+ bagira = Bagira.new(path_to_your_file)
39
+ output = bagira.perform_ocr
40
+ ```
41
+
42
+ ## Development
43
+
44
+ RSpec is included with basic tests; see the spec/ folder for more details.
45
+
46
+ ## Contributing
47
+
48
+ Bug reports and pull requests are welcome on Bitbucket at https://bitbucket.org/juanmvallejo/bagira/issues
49
+
50
+
51
+ ## License
52
+
53
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
54
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Define the :spec task that the default task depends on. Without this,
# running a bare `rake` aborts with "Don't know how to build task 'spec'".
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments runs the spec suite.
task :default => :spec
data/bagira.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'bagira/version'
5
+
6
Gem::Specification.new do |spec|
  # --- Identity -----------------------------------------------------------
  spec.name    = 'bagira'
  spec.version = Bagira::VERSION
  spec.authors = ['Juan Manuel Vallejo']
  spec.email   = ['jmvallejo@gmail.com']

  # --- Listing text (summary and description currently share one sentence) -
  spec.summary     = 'This gem uses Ghostscript and Tesseract to perform OCR on various file formats.'
  spec.description = 'This gem uses Ghostscript and Tesseract to perform OCR on various file formats.'
  spec.homepage    = 'http://bitbucket.org/juanmvallejo/bagira/overview'
  spec.license     = 'MIT'

  # --- Packaged files -------------------------------------------------------
  # Ship every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are the basenames of any packaged files under exe/.
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # --- Development dependencies --------------------------------------------
  spec.add_development_dependency 'bundler', '~> 1.11'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec'

  # --- Runtime dependencies -------------------------------------------------
  spec.add_dependency 'pdf-reader', '~> 1.4'
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "bagira"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
# Holds the gem's release number.
class Bagira
  # Current gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
data/lib/bagira.rb ADDED
@@ -0,0 +1,59 @@
1
+ require "bagira/version"
2
+ require "pdf-reader"
3
+
4
require "shellwords"

# Performs OCR on an image (JPG/PNG) or a PDF document by shelling out to the
# `tesseract` command, with `gs` (Ghostscript) rasterising PDF pages first.
#
# Usage:
#   bagira = Bagira.new(path_to_your_file)
#   output = bagira.perform_ocr
class Bagira
  # Labels reported by #file_type.
  FILE_TYPE = {
    invalid: 'invalid',
    not_found: 'not_found',
    pdf: 'pdf',
    jpg: 'jpg',
    png: 'png'
  }.freeze

  # One of the FILE_TYPE values describing the file given to the constructor.
  attr_reader :file_type
  # The path given to the constructor, unchanged.
  attr_reader :filepath
  # 1 for images, the PDF's page count for documents, 0 otherwise.
  attr_reader :page_count

  # Classifies the file by existence and extension and records its page count.
  #
  # @param filepath [String] path to a .jpg, .png or .pdf file
  def initialize(filepath)
    @filepath = filepath
    @file_type = detect_file_type(filepath)
    @page_count = count_pages
  end

  # Runs OCR on the file.
  #
  # @return [String, nil] the recognised text with surrounding whitespace
  #   stripped (PDF pages are joined with "\n"), or nil when the file is
  #   missing or of an unsupported type
  def perform_ocr
    return process_image(@filepath) if is_image?
    return process_pdf(@filepath) if is_document?

    nil
  end

  private

  # A missing file is always 'not_found', whatever its extension. Otherwise
  # the extension decides. `end_with?` is used instead of a `$`-anchored regex
  # because `$` also matches just before a trailing newline.
  def detect_file_type(filepath)
    return FILE_TYPE[:not_found] unless File.exist?(filepath)
    return FILE_TYPE[:pdf] if filepath.end_with?('.pdf')
    return FILE_TYPE[:png] if filepath.end_with?('.png')
    return FILE_TYPE[:jpg] if filepath.end_with?('.jpg')

    FILE_TYPE[:invalid]
  end

  # Images count as a single page; PDFs are opened with PDF::Reader to read
  # their page count; anything else has no pages.
  def count_pages
    return 1 if is_image?
    return PDF::Reader.new(@filepath).page_count if is_document?

    0
  end

  # Runs tesseract on a single image and returns its stripped stdout.
  def process_image(filepath)
    `#{image_command(filepath)}`.strip
  end

  # Rasterises each page with Ghostscript, pipes it to tesseract, and joins
  # the stripped per-page text with newlines.
  def process_pdf(filepath)
    (1..@page_count).map { |page| `#{pdf_page_command(filepath, page)}`.strip }.join("\n")
  end

  # Shell command that OCRs one image. The path is shell-escaped so spaces and
  # metacharacters in file names cannot split or inject commands.
  def image_command(filepath)
    "tesseract #{Shellwords.escape(filepath)} stdout"
  end

  # Shell pipeline that renders one PDF page as a grayscale PNG on stdout and
  # feeds it to tesseract. The path is shell-escaped for the same reason as in
  # #image_command.
  def pdf_page_command(filepath, page)
    "gs -q -dNOPAUSE -sDEVICE=pnggray -dTextAlphaBits=4 -r300 -sOutputFile=%stdout -dBATCH " \
      "-dFirstPage=#{page} -dLastPage=#{page} #{Shellwords.escape(filepath)}|tesseract stdin stdout"
  end

  # True when the file was classified as JPG or PNG.
  def is_image?
    @file_type.eql?(FILE_TYPE[:jpg]) || @file_type.eql?(FILE_TYPE[:png])
  end

  # True when the file was classified as PDF.
  def is_document?
    @file_type.eql?(FILE_TYPE[:pdf])
  end
end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bagira
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Juan Manuel Vallejo
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-04-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pdf-reader
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.4'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '1.4'
69
+ description: This gem uses Ghostscript and Tesseract to perform OCR on various file
70
+ formats.
71
+ email:
72
+ - jmvallejo@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .gitignore
78
+ - .rspec
79
+ - Gemfile
80
+ - LICENSE.txt
81
+ - README.md
82
+ - Rakefile
83
+ - bagira.gemspec
84
+ - bin/console
85
+ - bin/setup
86
+ - lib/bagira.rb
87
+ - lib/bagira/version.rb
88
+ homepage: http://bitbucket.org/juanmvallejo/bagira/overview
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - '>='
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.4.8
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: This gem uses Ghostscript and Tesseract to perform OCR on various file formats.
112
+ test_files: []