scandex 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/scandex +3 -0
- data/lib/runner.rb +50 -0
- data/lib/scandex.rb +135 -0
- data/lib/version.rb +3 -0
- metadata +134 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8219330418a57d92510475b295080d6664feef75
|
4
|
+
data.tar.gz: 33e6ae442689c6839a6e60bd573ca6764d0f98d0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 132313db43cece7cc6e91dd3f21a12b8cae2d4b62f615febfc64f68efbc532b1310a77675d426be2e8bd032a02a0f485e1f96d7124c664e0b4cd16e61948b200
|
7
|
+
data.tar.gz: abfef9971c9d4223fe600761dfa2179dabf3b36cb0de1ec44eda99be9113a7ff093ce4d6f793be256f5d79672ff68640d32dd94a0670478e70f9dcc5910590cb
|
data/bin/scandex
ADDED
data/lib/runner.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'filewatcher'
|
3
|
+
require_relative 'scandex'
|
4
|
+
|
5
|
+
# Command-line front end for ScanDex, built on Thor.
#
# Every command accepts the class-wide `-f` option naming the directory
# that holds the index database; it is passed through unchanged to ScanDex.
class Runner < Thor
  class_option :f, :banner => "Directory where to store index"

  desc "doctor", "Checks system to make sure everything is installed properly"
  # Reports whether the external tools ScanDex shells out to are installed.
  def doctor
    # ScanDex::doctor only prints when something is missing, so confirm
    # success here; otherwise the command would finish without any output.
    puts "All tools available" if ScanDex::doctor
  end

  desc "list", "List all documents indexed"
  # Prints one line per indexed document.
  def list
    print_documents(ScanDex::documents(options[:f]))
  end

  desc "index FILES", "Index scanned FILES"
  option :force, :type => :boolean
  # Indexes each of the given files; --force is forwarded to ScanDex as the
  # force flag.
  def index(*files)
    # If doctor fails then there is no point in even trying
    return unless ScanDex::doctor

    files.each do |file|
      ScanDex::index_and_store(options[:f], file, options[:force])
    end
  end

  desc "search TERM", "Search documents containing TERM"
  # Prints one line per document returned by the search for TERM.
  def search(term)
    print_documents(ScanDex::search_documents(options[:f], term))
  end

  desc "watch DIRECTORIES", "Watch DIRECTORIES for new files to index"
  # Watches the given directories and (re-)indexes files as they appear or
  # change. Runs until the watcher is interrupted.
  def watch(*directories)
    # Same pre-flight check as `index`: without the tools every event would fail.
    return unless ScanDex::doctor

    FileWatcher.new(directories).watch do |file, event|
      next unless event == :changed || event == :new

      # Always force: a changed file must replace its stale index entry.
      ScanDex::index_and_store(options[:f], file, true)
    end
  end

  private

  # Prints the first two columns of each result row, separated by a space.
  # Private so that Thor does not expose it as a command.
  def print_documents(rows)
    rows.each do |row|
      puts "#{row[0]} #{row[1]}"
    end
  end
end
|
50
|
+
|
data/lib/scandex.rb
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'time'
require 'tmpdir'

require 'rtesseract'
require 'sqlite3'
|
3
|
+
|
4
|
+
# ScanDex turns scanned documents (PDFs and images) into searchable text.
#
# A document is rasterised to grayscale JPEG pages with ImageMagick's
# `convert`, each page is run through Tesseract OCR, and the resulting text
# is stored in a SQLite database named `.scandex.db` inside the store
# directory (the home directory when no store directory is given).
module ScanDex
  # Lower-case file extensions (with leading dot) that can be indexed.
  ACCEPTED_FORMATS = %w[.pdf .png .jpg .jpeg .tiff].freeze

  # Path of the `tesseract` executable as reported by `which`, or "" if absent.
  def self.tesseract
    `which tesseract`.strip
  end

  # Path of the `gs` (GhostScript) executable as reported by `which`, or "".
  def self.gs
    `which gs`.strip
  end

  # Path of ImageMagick's `convert` executable as reported by `which`, or "".
  def self.convert
    `which convert`.strip
  end

  # Checks that every external tool is installed.
  #
  # Prints a message for the first missing tool and returns false; returns
  # true (printing nothing) when all tools were found.
  def self.doctor
    required = { 'ImageMagick' => :convert, 'GhostScript' => :gs, 'Tesseract' => :tesseract }
    # Locators run lazily, in order, so the check stops at the first missing tool.
    required.each do |tool, locator|
      next unless public_send(locator).empty?

      puts "#{tool} is missing"
      return false
    end
    true
  end

  # Indexes +file+ and, when text was extracted, stores it under the file's
  # absolute path. Returns nil when the file was ignored or produced no pages.
  def self.index_and_store(store_path, file, force = false)
    content = index(store_path, file, force)
    return if content.nil?

    store_document(store_path, File.expand_path(file), content)
  end

  # Extracts the text of +source+ via conversion and OCR.
  #
  # Returns the recognised text, or nil when the extension is not supported,
  # when the document is already stored and +force+ is not set, or when the
  # conversion produced no pages.
  def self.index(store_path, source, force = false)
    # Check the extension first: it is cheap and avoids opening the database
    # for files that can never be indexed.
    supported = ACCEPTED_FORMATS.include?(File.extname(source).downcase)
    # Documents are stored under their absolute path, so the lookup has to
    # use the absolute path too or relative paths would never match.
    known = supported && !force && has_document(store_path, File.expand_path(source))
    if !supported || known
      puts "Ignoring '#{source}'"
      return nil
    end

    puts "Indexing #{source}"
    # Block form: the scratch directory is removed once OCR has finished.
    Dir.mktmpdir('scandex_') do |tmp|
      pages = convert_to_gray_scale(source, tmp)
      puts "Found #{pages.size} page(s)"
      pages.empty? ? nil : ocr(pages)
    end
  end

  # Rasterises +source+ into grayscale JPEG pages inside +destination+.
  #
  # Returns the page file names in page order, or [] when the conversion failed.
  def self.convert_to_gray_scale(source, destination)
    puts "Converting '#{File.basename(source)}'"
    binary = convert
    # Multi-argument system bypasses the shell, so quotes, spaces or `$` in
    # file names cannot break or alter the command.
    converted = !binary.empty? &&
                system(binary, '-density', '300', '-depth', '8', '-type', 'grayscale',
                       source, File.join(destination, 'convert-%04d.jpg'))
    unless converted
      puts "Failed to convert #{source}"
      return []
    end
    # Glob order is not guaranteed on every Ruby version; the zero-padded
    # page numbers make a plain sort equal to page order.
    Dir[File.join(destination, 'convert-*.jpg')].sort
  end

  # Runs OCR on every page and returns the concatenated text.
  def self.ocr(pages, language = "eng")
    recognised = pages.map do |page|
      puts "OCR on '#{File.basename(page)}'"
      image_to_string(page, language)
    end
    recognised.join
  end

  # TODO orientation and language detection
  # Returns the text Tesseract recognises in a single image file.
  def self.image_to_string(image, language = "eng")
    RTesseract.new(image, :lang => language).to_s
  end

  # Opens the index database under +store_path+, creating the schema when
  # needed, and returns the open SQLite3::Database. The caller owns the
  # handle and is responsible for closing it.
  def self.db(store_path)
    store_path = '~/' if store_path.nil? || store_path.empty?
    filename = File.expand_path(File.join(store_path, '.scandex.db'))
    puts "Creating DB" unless File.exist?(filename)
    database = SQLite3::Database.new(filename)
    # IF NOT EXISTS also covers a database file that exists but has no table yet.
    database.execute("CREATE TABLE IF NOT EXISTS documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)")
    database
  end

  # Yields an open database handle and closes it afterwards, returning the
  # block's value.
  def self.with_db(store_path)
    handle = db(store_path)
    yield handle
  ensure
    handle.close if handle
  end
  private_class_method :with_db

  # True when a document is stored under exactly +name+.
  def self.has_document(store_path, name)
    with_db(store_path) do |handle|
      !handle.execute("SELECT name FROM documents WHERE name = ?", [name]).empty?
    end
  end

  # Returns [name, created, modified] rows for every stored document.
  def self.documents(store_path)
    with_db(store_path) do |handle|
      handle.execute("SELECT name, created, modified FROM documents")
    end
  end

  # Returns [name, created, modified] rows for documents whose content or
  # name contains +text+, compared in lower case.
  def self.search_documents(store_path, text)
    pattern = "%#{text.downcase}%"
    with_db(store_path) do |handle|
      handle.execute("SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?", [pattern, pattern])
    end
  end

  # Inserts or updates the stored text for +source+.
  #
  # An existing row keeps its created timestamp; only content and modified
  # are refreshed.
  def self.store_document(store_path, source, content)
    # NOTE(review): File.ctime is the inode change time on Unix rather than a
    # true creation time; it is the closest portable value. These two
    # timestamps were previously assigned the other way round.
    created = File.ctime(source).utc.iso8601
    modified = File.mtime(source).utc.iso8601

    with_db(store_path) do |handle|
      existing = handle.execute("SELECT name FROM documents WHERE name = ?", [source])
      if existing.empty?
        puts "Insert: #{source} #{created} #{modified}"
        handle.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)", [source, content, created, modified])
      else
        puts "Update: #{source} #{created} #{modified}"
        handle.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?", [content, modified, source])
      end
    end
  end
end
|
135
|
+
|
data/lib/version.rb
ADDED
metadata
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scandex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jerome Poichet
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-02-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rtesseract
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.3.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.3.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rmagick
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.15.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 2.15.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: sqlite3
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.3.11
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.3.11
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: thor
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.19.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.19.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: filewatcher
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.5.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.5.3
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: gems
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 2.4.6
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 2.4.6
|
97
|
+
description: A very simple tool to index scanned documents in either PDF or image
|
98
|
+
format
|
99
|
+
email: poitch@gmail.com
|
100
|
+
executables:
|
101
|
+
- scandex
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- bin/scandex
|
106
|
+
- lib/runner.rb
|
107
|
+
- lib/scandex.rb
|
108
|
+
- lib/version.rb
|
109
|
+
homepage: http://github.com/poitch/scandex
|
110
|
+
licenses:
|
111
|
+
- MIT
|
112
|
+
metadata: {}
|
113
|
+
post_install_message:
|
114
|
+
rdoc_options: []
|
115
|
+
require_paths:
|
116
|
+
- lib
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.4.6
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: Index your scanned paper documents
|
134
|
+
test_files: []
|