scandex 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/scandex +3 -0
- data/lib/runner.rb +50 -0
- data/lib/scandex.rb +135 -0
- data/lib/version.rb +3 -0
- metadata +134 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8219330418a57d92510475b295080d6664feef75
|
4
|
+
data.tar.gz: 33e6ae442689c6839a6e60bd573ca6764d0f98d0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 132313db43cece7cc6e91dd3f21a12b8cae2d4b62f615febfc64f68efbc532b1310a77675d426be2e8bd032a02a0f485e1f96d7124c664e0b4cd16e61948b200
|
7
|
+
data.tar.gz: abfef9971c9d4223fe600761dfa2179dabf3b36cb0de1ec44eda99be9113a7ff093ce4d6f793be256f5d79672ff68640d32dd94a0670478e70f9dcc5910590cb
|
data/bin/scandex
ADDED
data/lib/runner.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'filewatcher'
|
3
|
+
require_relative 'scandex'
|
4
|
+
|
5
|
+
# Command-line front end for ScanDex, built on Thor.
#
# Every command honours the class-wide +f+ option, which names the directory
# holding the index; it is passed through to ScanDex as-is (ScanDex picks its
# own default when the option is absent).
class Runner < Thor
  class_option :f, :banner => "Directory where to store index"

  desc "doctor", "Checks system to make sure everything is installed properly"
  # Runs the ScanDex tool check; ScanDex.doctor prints which tool is missing.
  def doctor
    ScanDex.doctor
  end

  desc "list", "List all documents indexed"
  # Prints one line per indexed document.
  def list
    print_documents(ScanDex.documents(options[:f]))
  end

  desc "index FILES", "Index scanned FILES"
  option :force, :type => :boolean
  # Indexes each of FILES. With --force, files already in the index are
  # processed again.
  def index(*files)
    # If doctor fails then there is no point in even trying
    return unless ScanDex.doctor

    files.each do |file|
      ScanDex.index_and_store(options[:f], file, options[:force])
    end
  end

  desc "search TERM", "Search documents containing TERM"
  # Prints one line per document matching TERM.
  def search(term)
    print_documents(ScanDex.search_documents(options[:f], term))
  end

  desc "watch DIRECTORIES", "Watch DIRECTORIES for new files to index"
  # Watches DIRECTORIES and (re)indexes any file reported as new or changed.
  def watch(*directories)
    # Same gate as `index`: without the external tools every watch event
    # would fail, so refuse to start instead.
    return unless ScanDex.doctor

    FileWatcher.new(directories).watch do |file, event|
      next unless event == :changed || event == :new

      # Always forced: a change event means the stored content is stale.
      ScanDex.index_and_store(options[:f], file, true)
    end
  end

  private

  # Prints the first two columns of each result row, separated by a space.
  # Private so that Thor does not expose it as a command.
  def print_documents(rows)
    rows.each do |row|
      puts "#{row[0]} #{row[1]}"
    end
  end
end
|
50
|
+
|
data/lib/scandex.rb
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'time'
require 'tmpdir'

require 'rtesseract'
require 'sqlite3'
|
3
|
+
|
4
|
+
# Indexes scanned documents: converts them to grayscale page images with
# ImageMagick, runs OCR on each page through RTesseract, and stores the
# extracted text in a SQLite database so it can be listed and searched.
module ScanDex
  # File extensions (lower case, with leading dot) that can be indexed.
  ACCEPTED_FORMATS = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff'].freeze

  # Schema of the single table backing the index.
  CREATE_DOCUMENTS_SQL =
    "CREATE TABLE IF NOT EXISTS documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)".freeze

  # Returns the output of `which tesseract`, stripped ('' when nothing is printed).
  def self.tesseract
    `which tesseract`.strip
  end

  # Returns the output of `which gs`, stripped ('' when nothing is printed).
  def self.gs
    `which gs`.strip
  end

  # Returns the output of `which convert`, stripped ('' when nothing is printed).
  def self.convert
    `which convert`.strip
  end

  # Checks that the external tools are installed.
  #
  # Tools are probed in order and the check stops at the first missing one,
  # printing "<tool> is missing". Returns true when all three are found,
  # false otherwise.
  def self.doctor
    { 'ImageMagick' => :convert, 'GhostScript' => :gs, 'Tesseract' => :tesseract }.each do |label, finder|
      next unless public_send(finder).empty?

      puts "#{label} is missing"
      return false
    end
    true
  end

  # Indexes +file+ and, when text was produced, stores it under the file's
  # absolute path. +force+ re-indexes files that are already stored.
  def self.index_and_store(store_path, file, force = false)
    content = index(store_path, file, force)
    return if content.nil?

    store_document(store_path, File.expand_path(file), content)
  end

  # Extracts the text of +source+.
  #
  # Returns the OCR text, or nil when the file is skipped (unsupported
  # extension, already indexed and not forced) or yields no pages.
  def self.index(store_path, source, force = false)
    # The extension check comes first so that unsupported files never cause
    # the database to be opened.
    supported = ACCEPTED_FORMATS.include?(File.extname(source).downcase)
    unless supported && (force || !has_document(store_path, source))
      puts "Ignoring '#{source}'"
      return nil
    end

    puts "Indexing #{source}"
    # Block form: the directory of intermediate page images is deleted once
    # OCR is done instead of accumulating in the system temp directory.
    Dir.mktmpdir('scandex_') do |tmp|
      pages = convert_to_gray_scale(source, tmp)
      puts "Found #{pages.size} page(s)"
      pages.empty? ? nil : ocr(pages)
    end
  end

  # Renders +source+ into grayscale JPEG pages inside +destination+.
  #
  # Returns the page image paths in page order, or [] when the conversion
  # command fails.
  def self.convert_to_gray_scale(source, destination)
    puts "Converting '#{File.basename(source)}'"
    # Argument-list form of system: no shell is involved, so file names with
    # quotes, spaces, `$` or backticks are passed through verbatim.
    converted = system(convert, '-density', '300', '-depth', '8', '-type', 'grayscale',
                       source, "#{destination}/convert-%04d.jpg")
    unless converted
      puts "Failed to convert #{source}"
      return []
    end

    # Sorted so the zero-padded page numbers give a stable page order.
    Dir["#{destination}/convert-*.jpg"].sort
  end

  # Runs OCR over +pages+ in order and returns the concatenated text.
  def self.ocr(pages, language = "eng")
    pages.map do |page|
      puts "OCR on '#{File.basename(page)}'"
      image_to_string(page, language)
    end.join
  end

  # TODO orientation and language detection
  # Returns the text RTesseract recognises in +image+.
  def self.image_to_string(image, language = "eng")
    RTesseract.new(image, :lang => language).to_s
  end

  # Opens the index database at "<store_path>/.scandex.db" (the home
  # directory when +store_path+ is nil or empty), making sure the documents
  # table exists. Prints "Creating DB" when the file did not exist yet.
  #
  # Returns an open SQLite3::Database; the caller is responsible for closing it.
  def self.db(store_path)
    store_path = '~/' if store_path.nil? || store_path.empty?
    filename = File.expand_path("#{store_path}/.scandex.db")
    fresh = !File.exist?(filename)
    handle = SQLite3::Database.new(filename)
    puts "Creating DB" if fresh
    # IF NOT EXISTS also repairs a database file that exists without the table.
    handle.execute(CREATE_DOCUMENTS_SQL)
    handle
  end

  # Yields an open database handle and closes it afterwards, returning the
  # block's value.
  def self.with_db(store_path)
    handle = db(store_path)
    begin
      yield handle
    ensure
      handle.close
    end
  end
  private_class_method :with_db

  # True when a document is stored under +name+. The name is expanded to an
  # absolute path first, because that is the form store_document receives
  # from index_and_store.
  def self.has_document(store_path, name)
    with_db(store_path) do |handle|
      !handle.execute("SELECT name FROM documents WHERE name = ?", [File.expand_path(name)]).empty?
    end
  end

  # Returns [name, created, modified] rows for every stored document.
  def self.documents(store_path)
    with_db(store_path) do |handle|
      handle.execute("SELECT name, created, modified FROM documents")
    end
  end

  # Returns [name, created, modified] rows for documents whose content or
  # name contains +text+, compared in lower case.
  def self.search_documents(store_path, text)
    pattern = "%#{text.downcase}%"
    with_db(store_path) do |handle|
      handle.execute(
        "SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?",
        [pattern, pattern]
      )
    end
  end

  # Inserts +content+ for +source+, or updates content and modified time
  # when a row with that name already exists.
  def self.store_document(store_path, source, content)
    # NOTE(review): `created` records the file's mtime and `modified` its
    # ctime. This mapping looks unintentional; confirm the intent before
    # changing it, since it determines what is stored.
    created = File.mtime(source).utc.iso8601
    modified = File.ctime(source).utc.iso8601

    with_db(store_path) do |handle|
      existing = handle.execute("SELECT name FROM documents WHERE name = ?", [source])
      if existing.empty?
        puts "Insert: #{source} #{created} #{modified}"
        handle.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)",
                       [source, content, created, modified])
      else
        puts "Update: #{source} #{created} #{modified}"
        handle.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?",
                       [content, modified, source])
      end
    end
  end
end
|
135
|
+
|
data/lib/version.rb
ADDED
metadata
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scandex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jerome Poichet
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-02-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rtesseract
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.3.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.3.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rmagick
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.15.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 2.15.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: sqlite3
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.3.11
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.3.11
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: thor
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.19.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.19.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: filewatcher
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.5.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.5.3
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: gems
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 2.4.6
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 2.4.6
|
97
|
+
description: A very simple tool to index scanned documents in either PDF or image
|
98
|
+
format
|
99
|
+
email: poitch@gmail.com
|
100
|
+
executables:
|
101
|
+
- scandex
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- bin/scandex
|
106
|
+
- lib/runner.rb
|
107
|
+
- lib/scandex.rb
|
108
|
+
- lib/version.rb
|
109
|
+
homepage: http://github.com/poitch/scandex
|
110
|
+
licenses:
|
111
|
+
- MIT
|
112
|
+
metadata: {}
|
113
|
+
post_install_message:
|
114
|
+
rdoc_options: []
|
115
|
+
require_paths:
|
116
|
+
- lib
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.4.6
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: Index your scanned paper documents
|
134
|
+
test_files: []
|