scandex 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6):
  1. checksums.yaml +7 -0
  2. data/bin/scandex +3 -0
  3. data/lib/runner.rb +50 -0
  4. data/lib/scandex.rb +135 -0
  5. data/lib/version.rb +3 -0
  6. metadata +134 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8219330418a57d92510475b295080d6664feef75
4
+ data.tar.gz: 33e6ae442689c6839a6e60bd573ca6764d0f98d0
5
+ SHA512:
6
+ metadata.gz: 132313db43cece7cc6e91dd3f21a12b8cae2d4b62f615febfc64f68efbc532b1310a77675d426be2e8bd032a02a0f485e1f96d7124c664e0b4cd16e61948b200
7
+ data.tar.gz: abfef9971c9d4223fe600761dfa2179dabf3b36cb0de1ec44eda99be9113a7ff093ce4d6f793be256f5d79672ff68640d32dd94a0670478e70f9dcc5910590cb
@@ -0,0 +1,3 @@
1
#!/usr/bin/env ruby
# Executable entry point for the gem: load the Runner CLI from lib/runner.rb
# and let it dispatch on the arguments given on the command line.
require_relative '../lib/runner'

Runner.start(ARGV)
@@ -0,0 +1,50 @@
1
+ require 'thor'
2
+ require 'filewatcher'
3
+ require_relative 'scandex'
4
+
5
# Thor-based command-line interface for ScanDex.
#
# Every command accepts the class-wide `-f` option; its value is handed to
# ScanDex unchanged as the directory holding the index (ScanDex picks its own
# default when the option is absent).
class Runner < Thor
  class_option :f, :banner => "Directory where to store index"

  desc "doctor", "Checks system to make sure everything is installed properly"
  # Reports the first missing external tool, if any.
  def doctor
    ScanDex.doctor
  end

  desc "list", "List all documents indexed"
  # Prints one line per indexed document.
  def list
    print_documents(ScanDex.documents(options[:f]))
  end

  desc "index FILES", "Index scanned FILES"
  option :force, :type => :boolean
  # Indexes each of FILES; with --force, files already in the index are
  # processed again.
  def index(*files)
    # If doctor fails then there is no point in even trying.
    return unless ScanDex.doctor

    files.each do |file|
      ScanDex.index_and_store(options[:f], file, options[:force])
    end
  end

  desc "search TERM", "Search documents containing TERM"
  # Prints one line per document whose name or content matches TERM.
  def search(term)
    print_documents(ScanDex.search_documents(options[:f], term))
  end

  desc "watch DIRECTORIES", "Watch DIRECTORIES for new files to index"
  # Blocks, re-indexing every file reported as new or changed.
  def watch(*directories)
    # Same precondition as `index`: without the tools every event would fail.
    return unless ScanDex.doctor

    FileWatcher.new(directories).watch do |file, event|
      next unless [:changed, :new].include?(event)

      begin
        ScanDex.index_and_store(options[:f], file, true)
      rescue StandardError => e
        # One unreadable or vanished file must not stop the watcher.
        warn "Failed to index #{file}: #{e.message}"
      end
    end
  end

  private

  # Prints the first two columns (name and created timestamp) of each row
  # returned by ScanDex.documents / ScanDex.search_documents.
  def print_documents(rows)
    rows.each do |row|
      puts "#{row[0]} #{row[1]}"
    end
  end
end
50
+
@@ -0,0 +1,135 @@
1
require 'time'
require 'tmpdir'

require 'rtesseract'
require 'sqlite3'
3
+
4
+ module ScanDex
5
+ def self.tesseract
6
+ `which tesseract`.strip
7
+ end
8
+
9
+ def self.gs
10
+ `which gs`.strip
11
+ end
12
+
13
+ def self.convert
14
+ `which convert`.strip
15
+ end
16
+
17
+ def self.doctor
18
+ convert = self.convert
19
+ if convert.empty?
20
+ puts "ImageMagick is missing"
21
+ return false
22
+ end
23
+ gs = self.gs
24
+ if gs.empty?
25
+ puts "GhostScript is missing"
26
+ return false
27
+ end
28
+ tesseract = self.tesseract
29
+ if tesseract.empty?
30
+ puts "Tesseract is missing"
31
+ return false
32
+ end
33
+ #puts "All Tools Available"
34
+ true
35
+ end
36
+
37
+ def self.index_and_store(store_path, file, force = false)
38
+ content = ScanDex::index(store_path, file, force)
39
+ if !content.nil?
40
+ file = File.expand_path(file)
41
+ ScanDex::store_document(store_path, file, content)
42
+ end
43
+ end
44
+
45
+ def self.index(store_path, source, force = false)
46
+ accepted_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff']
47
+ if (force || !self.has_document(store_path, source)) && accepted_formats.include?(File.extname(source).downcase)
48
+ puts "Indexing #{source}"
49
+ tmp = Dir.mktmpdir('scandex_')
50
+ pages = convert_to_gray_scale(source, tmp)
51
+ puts "Found #{pages.size} page(s)"
52
+ if pages.size > 0
53
+ ocr(pages)
54
+ else
55
+ nil
56
+ end
57
+ else
58
+ puts "Ignoring '#{source}'"
59
+ nil
60
+ end
61
+ end
62
+
63
+ def self.convert_to_gray_scale(source, destination)
64
+ cmd = "#{self.convert} -density 300 -depth 8 -type grayscale \"#{source}\" #{destination}/convert-%04d.jpg"
65
+ #puts "cmd = #{cmd}"
66
+ puts "Converting '#{File.basename(source)}'"
67
+ ret = system(cmd)
68
+ if !ret
69
+ puts "Failed to convert #{source}"
70
+ []
71
+ else
72
+ Dir["#{destination}/convert-*.jpg"]
73
+ end
74
+ end
75
+
76
+ def self.ocr(pages, language = "eng")
77
+ text = ''
78
+ pages.each do |page|
79
+ puts "OCR on '#{File.basename(page)}'"
80
+ text += image_to_string(page, language)
81
+ end
82
+ text
83
+ end
84
+
85
+ # TODO orientation and language detection
86
+ def self.image_to_string(image, language = "eng")
87
+ img = RTesseract.new(image, :lang => language)
88
+ img.to_s
89
+ end
90
+
91
+ def self.db(store_path)
92
+ store_path = '~/' if store_path.nil? || store_path.empty?
93
+ filename = File.expand_path("#{store_path}/.scandex.db")
94
+ migrate = !File.exists?(filename)
95
+ db = SQLite3::Database.new(filename)
96
+ if migrate
97
+ puts "Creating DB"
98
+ db.execute("CREATE TABLE documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)")
99
+ end
100
+ db
101
+ end
102
+
103
+ def self.has_document(store_path, name)
104
+ db = self.db(store_path)
105
+ rows = db.execute("SELECT name FROM documents WHERE name = ?", [name])
106
+ rows.size > 0
107
+ end
108
+
109
+ def self.documents(store_path)
110
+ db = self.db(store_path)
111
+ db.execute("SELECT name, created, modified FROM documents")
112
+ end
113
+
114
+ def self.search_documents(store_path, text)
115
+ db = self.db(store_path)
116
+ pattern = "%#{text.downcase}%"
117
+ db.execute("SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?", [pattern, pattern])
118
+ end
119
+
120
+ def self.store_document(store_path, source, content)
121
+ created = File.mtime(source).utc.iso8601
122
+ modified = File.ctime(source).utc.iso8601
123
+
124
+ db = self.db(store_path)
125
+ rows = db.execute("SELECT * FROM documents WHERE name = ?", source)
126
+ if rows.size == 0
127
+ puts "Insert: #{source} #{created} #{modified}"
128
+ db.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)", [source, content, created, modified])
129
+ else
130
+ puts "Update: #{source} #{created} #{modified}"
131
+ db.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?", [content, modified, source])
132
+ end
133
+ end
134
+ end
135
+
@@ -0,0 +1,3 @@
1
module ScanDex
  # Version of the scandex gem. Frozen so the constant cannot be mutated in place.
  VERSION = '0.1.1'.freeze
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scandex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jerome Poichet
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rtesseract
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rmagick
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 2.15.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 2.15.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.3.11
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.3.11
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.19.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.19.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: filewatcher
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 0.5.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '='
81
+ - !ruby/object:Gem::Version
82
+ version: 0.5.3
83
+ - !ruby/object:Gem::Dependency
84
+ name: gems
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '='
88
+ - !ruby/object:Gem::Version
89
+ version: 2.4.6
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '='
95
+ - !ruby/object:Gem::Version
96
+ version: 2.4.6
97
+ description: A very simple tool to index scanned documents in either PDF or image
98
+ format
99
+ email: poitch@gmail.com
100
+ executables:
101
+ - scandex
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - bin/scandex
106
+ - lib/runner.rb
107
+ - lib/scandex.rb
108
+ - lib/version.rb
109
+ homepage: http://github.com/poitch/scandex
110
+ licenses:
111
+ - MIT
112
+ metadata: {}
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.6
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: Index your scanned paper documents
134
+ test_files: []