scandex 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. checksums.yaml +7 -0
  2. data/bin/scandex +3 -0
  3. data/lib/runner.rb +50 -0
  4. data/lib/scandex.rb +135 -0
  5. data/lib/version.rb +3 -0
  6. metadata +134 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8219330418a57d92510475b295080d6664feef75
4
+ data.tar.gz: 33e6ae442689c6839a6e60bd573ca6764d0f98d0
5
+ SHA512:
6
+ metadata.gz: 132313db43cece7cc6e91dd3f21a12b8cae2d4b62f615febfc64f68efbc532b1310a77675d426be2e8bd032a02a0f485e1f96d7124c664e0b4cd16e61948b200
7
+ data.tar.gz: abfef9971c9d4223fe600761dfa2179dabf3b36cb0de1ec44eda99be9113a7ff093ce4d6f793be256f5d79672ff68640d32dd94a0670478e70f9dcc5910590cb
@@ -0,0 +1,3 @@
1
#!/usr/bin/env ruby
# Executable entry point: loads the Thor command class and lets it dispatch
# on the command-line arguments.
require_relative '../lib/runner'

Runner.start(ARGV)
@@ -0,0 +1,50 @@
1
+ require 'thor'
2
+ require 'filewatcher'
3
+ require_relative 'scandex'
4
+
5
# Thor command set for the `scandex` executable. Each command is a thin
# wrapper that forwards to the ScanDex module; the shared `-f` class option
# is passed through as the index location.
class Runner < Thor
  class_option :f, banner: "Directory where to store index"

  desc "doctor", "Checks system to make sure everything is installed properly"
  # Runs the ScanDex tool check.
  def doctor
    ScanDex.doctor
  end

  desc "list", "List all documents indexed"
  # Prints the first two columns of every row returned by ScanDex.documents.
  def list
    ScanDex.documents(options[:f]).each do |row|
      puts "#{row[0]} #{row[1]}"
    end
  end

  desc "index FILES", "Index scanned FILES"
  option :force, type: :boolean
  # Indexes each given file; does nothing when the tool check fails.
  def index(*files)
    # Without the external tools there is no point in even trying.
    return unless ScanDex.doctor

    files.each do |path|
      ScanDex.index_and_store(options[:f], path, options[:force])
    end
  end

  desc "search TERM", "Search documents containing TERM"
  # Prints the first two columns of every row matching TERM.
  def search(term)
    ScanDex.search_documents(options[:f], term).each do |row|
      puts "#{row[0]} #{row[1]}"
    end
  end

  desc "watch DIRECTORIES", "Watch DIRECTORIES for new files to index"
  # Re-indexes (forced) any file reported as new or changed by FileWatcher.
  def watch(*directories)
    watcher = FileWatcher.new(directories)
    watcher.watch do |file, event|
      ScanDex.index_and_store(options[:f], file, true) if [:changed, :new].include?(event)
    end
  end
end
50
+
@@ -0,0 +1,135 @@
1
require 'time'
require 'tmpdir'

require 'rtesseract'
require 'sqlite3'
3
+
4
# Indexes scanned documents: converts a PDF or image to grayscale pages with
# ImageMagick, runs Tesseract OCR over the pages and stores the recognised
# text in a SQLite database (`.scandex.db`) so it can be listed and searched.
module ScanDex
  # Lower-case file extensions (with leading dot) that may be indexed.
  ACCEPTED_FORMATS = %w[.pdf .png .jpg .jpeg .tiff].freeze

  # Display name => lookup method for every external tool `doctor` checks,
  # in the order they are checked.
  REQUIRED_TOOLS = {
    'ImageMagick' => :convert,
    'GhostScript' => :gs,
    'Tesseract' => :tesseract
  }.freeze

  # @return [String] path of the `tesseract` executable, or "" when not found
  def self.tesseract
    find_executable('tesseract')
  end

  # @return [String] path of the `gs` executable, or "" when not found
  def self.gs
    find_executable('gs')
  end

  # @return [String] path of the `convert` executable, or "" when not found
  def self.convert
    find_executable('convert')
  end

  # Verifies that every required external tool is on the PATH. Prints a
  # message for the first missing tool.
  #
  # @return [Boolean] true when all tools were found, false otherwise
  def self.doctor
    REQUIRED_TOOLS.each do |label, finder|
      next unless public_send(finder).empty?

      puts "#{label} is missing"
      return false
    end
    true
  end

  # Runs OCR on +file+ and, when text was produced, stores it under the
  # file's absolute path.
  #
  # @param store_path [String, nil] directory holding the index database
  # @param file [String] path of the document to index
  # @param force [Boolean] re-index even if the document is already stored
  # @return [Object, nil] nil when nothing was indexed
  def self.index_and_store(store_path, file, force = false)
    content = index(store_path, file, force)
    return if content.nil?

    store_document(store_path, File.expand_path(file), content)
  end

  # Extracts the text of +source+ via conversion + OCR.
  #
  # Files with an unsupported extension are rejected before the database is
  # touched. Already-indexed files are skipped unless +force+ is set; the
  # lookup uses the absolute path because that is the key documents are
  # stored under.
  #
  # @param store_path [String, nil] directory holding the index database
  # @param source [String] path of the document
  # @param force [Boolean] index even if the document is already stored
  # @return [String, nil] recognised text, or nil when the file was ignored
  #   or produced no pages
  def self.index(store_path, source, force = false)
    supported = ACCEPTED_FORMATS.include?(File.extname(source).downcase)
    if !supported || (!force && has_document(store_path, File.expand_path(source)))
      puts "Ignoring '#{source}'"
      return nil
    end

    puts "Indexing #{source}"
    # Block form removes the scratch directory once OCR has finished.
    Dir.mktmpdir('scandex_') do |tmp|
      pages = convert_to_gray_scale(source, tmp)
      puts "Found #{pages.size} page(s)"
      pages.empty? ? nil : ocr(pages)
    end
  end

  # Converts +source+ into one grayscale JPEG per page inside +destination+.
  #
  # The command is run with an argument vector (no shell), so characters in
  # the file name such as quotes, `$` or backticks are never interpreted.
  #
  # @param source [String] path of the document to convert
  # @param destination [String] existing directory that receives the pages
  # @return [Array<String>] page image paths in page order; empty on failure
  def self.convert_to_gray_scale(source, destination)
    puts "Converting '#{File.basename(source)}'"
    tool = convert
    converted = !tool.empty? && system(
      tool, '-density', '300', '-depth', '8', '-type', 'grayscale',
      source, "#{destination}/convert-%04d.jpg"
    )
    unless converted
      puts "Failed to convert #{source}"
      return []
    end

    # Zero-padded names sort lexicographically into page order.
    Dir["#{destination}/convert-*.jpg"].sort
  end

  # Runs OCR over every page and concatenates the results.
  #
  # @param pages [Array<String>] page image paths
  # @param language [String] Tesseract language code
  # @return [String] concatenated text ("" for no pages)
  def self.ocr(pages, language = "eng")
    pages.map do |page|
      puts "OCR on '#{File.basename(page)}'"
      image_to_string(page, language)
    end.join
  end

  # TODO: orientation and language detection
  #
  # @param image [String] path of the image to recognise
  # @param language [String] Tesseract language code
  # @return [String] recognised text
  def self.image_to_string(image, language = "eng")
    RTesseract.new(image, :lang => language).to_s
  end

  # Opens the index database, creating the `documents` table when the
  # database file does not exist yet. The caller owns the returned handle.
  #
  # @param store_path [String, nil] directory holding `.scandex.db`;
  #   nil or "" selects the home directory
  # @return [SQLite3::Database]
  def self.db(store_path)
    store_path = '~/' if store_path.nil? || store_path.empty?
    filename = File.expand_path("#{store_path}/.scandex.db")
    migrate = !File.exist?(filename)
    db = SQLite3::Database.new(filename)
    if migrate
      puts "Creating DB"
      db.execute("CREATE TABLE documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)")
    end
    db
  end

  # @param store_path [String, nil] directory holding the index database
  # @param name [String] document name as stored
  # @return [Boolean] whether a document with that name is stored
  def self.has_document(store_path, name)
    with_db(store_path) do |db|
      !db.execute("SELECT name FROM documents WHERE name = ?", [name]).empty?
    end
  end

  # @param store_path [String, nil] directory holding the index database
  # @return [Array<Array>] one [name, created, modified] row per document
  def self.documents(store_path)
    with_db(store_path) do |db|
      db.execute("SELECT name, created, modified FROM documents")
    end
  end

  # Finds documents whose content or name contains +text+ (case-insensitive).
  #
  # @param store_path [String, nil] directory holding the index database
  # @param text [String] search term
  # @return [Array<Array>] matching [name, created, modified] rows
  def self.search_documents(store_path, text)
    pattern = "%#{text.downcase}%"
    with_db(store_path) do |db|
      db.execute("SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?", [pattern, pattern])
    end
  end

  # Inserts the document, or updates its content and modification time when
  # a row with the same name already exists.
  #
  # @param store_path [String, nil] directory holding the index database
  # @param source [String] path of the document; used as its name
  # @param content [String] recognised text
  # @return [Object] result of the INSERT/UPDATE statement
  def self.store_document(store_path, source, content)
    created = creation_time(source).utc.iso8601
    modified = File.mtime(source).utc.iso8601

    with_db(store_path) do |db|
      if db.execute("SELECT name FROM documents WHERE name = ?", [source]).empty?
        puts "Insert: #{source} #{created} #{modified}"
        db.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)", [source, content, created, modified])
      else
        puts "Update: #{source} #{created} #{modified}"
        db.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?", [content, modified, source])
      end
    end
  end

  # Searches the PATH for an executable without spawning a subprocess.
  #
  # @param name [String] bare command name
  # @return [String] path of the first match, or "" when not found
  def self.find_executable(name)
    # PATHEXT is only set on Windows; the bare name is always tried first.
    extensions = [''] + ENV.fetch('PATHEXT', '').split(';')
    ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |dir|
      next if dir.empty?

      extensions.each do |ext|
        candidate = File.join(dir, "#{name}#{ext}")
        return candidate if File.file?(candidate) && File.executable?(candidate)
      end
    end
    ''
  end

  # Yields an open database handle and closes it afterwards, even when the
  # block raises.
  #
  # @param store_path [String, nil] directory holding the index database
  # @return [Object] the block's value
  def self.with_db(store_path)
    connection = db(store_path)
    yield connection
  ensure
    connection.close if connection
  end

  # Best available creation time: the birth time where the platform exposes
  # it, otherwise ctime.
  #
  # @param path [String] file to inspect
  # @return [Time]
  def self.creation_time(path)
    File.birthtime(path)
  rescue NotImplementedError
    File.ctime(path)
  end

  private_class_method :find_executable, :with_db, :creation_time
end
135
+
@@ -0,0 +1,3 @@
1
module ScanDex
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = '0.1.1'.freeze
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scandex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jerome Poichet
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rtesseract
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rmagick
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 2.15.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 2.15.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.3.11
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.3.11
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.19.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.19.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: filewatcher
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 0.5.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '='
81
+ - !ruby/object:Gem::Version
82
+ version: 0.5.3
83
+ - !ruby/object:Gem::Dependency
84
+ name: gems
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '='
88
+ - !ruby/object:Gem::Version
89
+ version: 2.4.6
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '='
95
+ - !ruby/object:Gem::Version
96
+ version: 2.4.6
97
+ description: A very simple tool to index scanned documents in either PDF or image
98
+ format
99
+ email: poitch@gmail.com
100
+ executables:
101
+ - scandex
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - bin/scandex
106
+ - lib/runner.rb
107
+ - lib/scandex.rb
108
+ - lib/version.rb
109
+ homepage: http://github.com/poitch/scandex
110
+ licenses:
111
+ - MIT
112
+ metadata: {}
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.6
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: Index your scanned paper documents
134
+ test_files: []