paperless 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/paperless +18 -5
- data/lib/paperless/date_search.rb +26 -14
- data/lib/paperless/engine.rb +34 -7
- data/lib/paperless/ocr_engines/pdfpen.rb +1 -0
- data/lib/paperless/ocr_engines/pdfpen6.rb +33 -0
- data/lib/paperless/ocr_engines/pdfpenpro.rb +1 -0
- data/lib/paperless/ocr_engines/pdfpenpro6.rb +33 -0
- data/lib/paperless/rule.rb +5 -0
- data/lib/paperless/services/evernote.rb +13 -2
- data/lib/paperless/services/finder.rb +9 -3
- data/lib/paperless/version.rb +1 -1
- data/lib/paperless.rb +2 -0
- metadata +5 -3
data/bin/paperless
CHANGED
@@ -131,6 +131,9 @@ desc 'Create a new note from a file'
|
|
131
131
|
arg_name 'file_name'
|
132
132
|
command :create do |c|
|
133
133
|
|
134
|
+
c.desc 'Dump the OCR text for the document to the terminal.'
|
135
|
+
c.switch :dump, :negatable => false, :default_value => false
|
136
|
+
|
134
137
|
c.desc 'Open a prompt to rename the file before its processed through the rules.'
|
135
138
|
c.switch :prompt, :negatable => false, :default_value => false
|
136
139
|
|
@@ -147,14 +150,15 @@ command :create do |c|
|
|
147
150
|
|
148
151
|
args.each do |file|
|
149
152
|
|
150
|
-
if File.exists?(File.expand_path file)
|
153
|
+
if File.exists?(File.expand_path file) && !File.directory?(File.expand_path file)
|
151
154
|
file = File.expand_path file
|
152
155
|
else
|
153
|
-
raise "File does not exist (#{file})"
|
156
|
+
raise "File does not exist or is a directory (#{file})"
|
154
157
|
end
|
155
158
|
|
156
|
-
|
157
|
-
|
159
|
+
old_filename = nil
|
160
|
+
|
161
|
+
if options[:prompt]
|
158
162
|
file_ext = File.extname(file)
|
159
163
|
filename = File.basename(file, file_ext)
|
160
164
|
user_input = `#{COCOADIALOG} standard-inputbox --title "Paperless Prompt" --informative-text "Rename your file before its processed with rules..." --text "#{filename}" --no-newline --string-output`.split("\n")
|
@@ -162,6 +166,10 @@ command :create do |c|
|
|
162
166
|
if user_input[0].match(/ok/i) && user_input[1] != File.basename(file, file_ext)
|
163
167
|
new_filename = File.join(File.dirname(file), user_input[1] + file_ext)
|
164
168
|
puts "Renaming file based on input to #{new_filename}"
|
169
|
+
if global_options[:simulate]
|
170
|
+
# save the file name to rename it back later
|
171
|
+
old_filename = file
|
172
|
+
end
|
165
173
|
File.rename(file, new_filename)
|
166
174
|
file = new_filename
|
167
175
|
end
|
@@ -182,7 +190,7 @@ command :create do |c|
|
|
182
190
|
file_ext = File.extname(file).gsub(/\./,'')
|
183
191
|
if file_ext == Paperless::PDF_EXT && options[:ocr]
|
184
192
|
puts "OCRing file..."
|
185
|
-
engine.ocr
|
193
|
+
engine.ocr(options[:dump])
|
186
194
|
end
|
187
195
|
|
188
196
|
if options[:proc_rules]
|
@@ -193,6 +201,11 @@ command :create do |c|
|
|
193
201
|
if global_options[:simulate]
|
194
202
|
puts "Simulating changes..."
|
195
203
|
engine.print
|
204
|
+
|
205
|
+
if global_options[:simulate] && options[:prompt] && !old_filename.nil?
|
206
|
+
puts "Renaming file back to #{old_filename}"
|
207
|
+
File.rename(file, old_filename)
|
208
|
+
end
|
196
209
|
else
|
197
210
|
puts "Saving #{file} to #{engine.service}"
|
198
211
|
engine.create({:delete => options[:delete]})
|
data/lib/paperless/date_search.rb
CHANGED
@@ -2,6 +2,7 @@ require 'date'
|
|
2
2
|
|
3
3
|
module DateSearch
|
4
4
|
|
5
|
+
SEP_NOSPACE = '\.\/\-\,'
|
5
6
|
SEP = '\. \/\-\,'
|
6
7
|
DAY = '(\d{1,2})'
|
7
8
|
MONTH = '([a-zA-Z]{3,15})'
|
@@ -49,7 +50,25 @@ module DateSearch
|
|
49
50
|
|
50
51
|
def date_search(text,date_locale)
|
51
52
|
date = nil
|
52
|
-
if match = text.match(/#{
|
53
|
+
if match = text.match(/#{DAY}[#{SEP_NOSPACE}]+#{DAY}[#{SEP_NOSPACE}]+#{YEAR}/)
|
54
|
+
# US: 12-29-2011
|
55
|
+
# Euro: 29-12-2011
|
56
|
+
year = valid_year(match[3])
|
57
|
+
day = date_locale == 'us' ? valid_day(match[2]) : valid_day(match[1])
|
58
|
+
month = date_locale == 'us' ? valid_month(match[1]) : valid_month(match[2])
|
59
|
+
|
60
|
+
if month && day && year
|
61
|
+
puts "Basing the date off the discovered string (3): #{match[0]}"
|
62
|
+
begin
|
63
|
+
date = DateTime.new(year,month,day)
|
64
|
+
rescue
|
65
|
+
puts "WARNING: Unable to create date object. #{$!}"
|
66
|
+
date = nil
|
67
|
+
end
|
68
|
+
else
|
69
|
+
puts "WARNING: The discovered date string does not validate: #{match[0]}"
|
70
|
+
end
|
71
|
+
elsif match = text.match(/#{MONTH}[#{SEP}]{0,3}#{DAY}[#{SEP}]{1,3}#{YEAR}/i)
|
53
72
|
# December 29, 2011
|
54
73
|
if valid_day(match[2]) && valid_year(match[3])
|
55
74
|
puts "Basing the date off the discovered string (1): #{match[0]}"
|
@@ -60,7 +79,7 @@ module DateSearch
|
|
60
79
|
date = nil
|
61
80
|
end
|
62
81
|
end
|
63
|
-
elsif match = text.match(/#{DAY}[#{SEP}]{0,3}#{MONTH}[#{SEP}]{0,3}#{YEAR}
|
82
|
+
elsif match = text.match(/#{DAY}[#{SEP}]{0,3}#{MONTH}[#{SEP}]{0,3}#{YEAR}/i)
|
64
83
|
# 29 December 2011
|
65
84
|
if valid_day(match[1]) && valid_year(match[3])
|
66
85
|
puts "Basing the date off the discovered string (2): #{match[0]}"
|
@@ -71,23 +90,16 @@ module DateSearch
|
|
71
90
|
date = nil
|
72
91
|
end
|
73
92
|
end
|
74
|
-
elsif match = text.match(/#{
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
day = date_locale == 'us' ? valid_day(match[2]) : valid_day(match[1])
|
79
|
-
month = date_locale == 'us' ? valid_month(match[1]) : valid_month(match[2])
|
80
|
-
|
81
|
-
if month && day && year
|
82
|
-
puts "Basing the date off the discovered string (3): #{match[0]}"
|
93
|
+
elsif match = text.match(/#{MONTH}[#{SEP}]{0,3}#{YEAR}/i)
|
94
|
+
# December 2011
|
95
|
+
if valid_year(match[2])
|
96
|
+
puts "Basing the date off the discovered string (2): #{match[0]}"
|
83
97
|
begin
|
84
|
-
date = DateTime.
|
98
|
+
date = DateTime.parse(repair_ocr_string(match[0]))
|
85
99
|
rescue
|
86
100
|
puts "WARNING: Unable to create date object. #{$!}"
|
87
101
|
date = nil
|
88
102
|
end
|
89
|
-
else
|
90
|
-
puts "WARNING: The discovered date string does not validate: #{match[0]}"
|
91
103
|
end
|
92
104
|
end
|
93
105
|
date
|
data/lib/paperless/engine.rb
CHANGED
@@ -8,6 +8,7 @@ module Paperless
|
|
8
8
|
PDF_EXT = 'pdf'
|
9
9
|
DATE_VAR = '<date>'
|
10
10
|
MATCH_VAR = '<match>'
|
11
|
+
FILENAME_VAR = '<filename>'
|
11
12
|
FILEDATE = 'filedate'
|
12
13
|
TODAY = 'today'
|
13
14
|
|
@@ -15,6 +16,7 @@ module Paperless
|
|
15
16
|
|
16
17
|
PDFPEN_ENGINE = 'pdfpen'
|
17
18
|
PDFPENPRO_ENGINE = 'pdfpenpro'
|
19
|
+
PDFPENPRO6_ENGINE = 'pdfpenpro6'
|
18
20
|
ACROBAT_ENGINE = 'acrobat'
|
19
21
|
DEVONTHINKPRO_ENGINE = 'devonthinkpro'
|
20
22
|
DEVONTHINKPRO_SERVICE = 'devonthinkpro'
|
@@ -25,7 +27,7 @@ module Paperless
|
|
25
27
|
|
26
28
|
def initialize(options)
|
27
29
|
@destination = nil
|
28
|
-
@service =
|
30
|
+
@service = options[:default_service]
|
29
31
|
@title = nil
|
30
32
|
@date = DateTime.now
|
31
33
|
@tags = Array.new()
|
@@ -119,7 +121,7 @@ module Paperless
|
|
119
121
|
# First check if there are actually any date rules
|
120
122
|
@rules.each do |rule|
|
121
123
|
if rule.condition == Paperless::DATE_VAR
|
122
|
-
@date = date_search(text,@date_locale)
|
124
|
+
@date = date_search(text,@date_locale) || date_search(@file,@date_locale)
|
123
125
|
end
|
124
126
|
end
|
125
127
|
|
@@ -141,6 +143,8 @@ module Paperless
|
|
141
143
|
reader.pages.each do |page|
|
142
144
|
break if @date = date_search(page.text,@date_locale)
|
143
145
|
end
|
146
|
+
# Check for the date in the file name if not found in the content
|
147
|
+
@date = date_search(@file,@date_locale) if @date.nil?
|
144
148
|
break
|
145
149
|
end
|
146
150
|
end
|
@@ -151,9 +155,21 @@ module Paperless
|
|
151
155
|
end
|
152
156
|
end
|
153
157
|
|
154
|
-
def ocr
|
158
|
+
def ocr(dump = false)
|
159
|
+
reader = PDF::Reader.new(@file)
|
160
|
+
if reader.pages.length > 0
|
161
|
+
text = reader.pages[0].text
|
162
|
+
if !text.nil? && text != ''
|
163
|
+
puts text if dump
|
164
|
+
puts "This doc already seems to be OCR'd. Not processing through #{@ocr_engine}"
|
165
|
+
return
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
155
169
|
puts "Running OCR on file with #{@ocr_engine}"
|
156
170
|
ocr_engine = case @ocr_engine
|
171
|
+
when /^#{PDFPENPRO6_ENGINE}$/i then PaperlessOCR::PDFpenPro6.new
|
172
|
+
when /^#{PDFPEN6_ENGINE}$/i then PaperlessOCR::PDFpen6.new
|
157
173
|
when /^#{PDFPENPRO_ENGINE}$/i then PaperlessOCR::PDFpenPro.new
|
158
174
|
when /^#{PDFPEN_ENGINE}$/i then PaperlessOCR::PDFpen.new
|
159
175
|
when /^#{ACROBAT_ENGINE}$/i then PaperlessOCR::Acrobat.new
|
@@ -163,6 +179,16 @@ module Paperless
|
|
163
179
|
|
164
180
|
if ocr_engine
|
165
181
|
ocr_engine.ocr({:file => @file})
|
182
|
+
|
183
|
+
if dump
|
184
|
+
puts "Dumping Page Content..."
|
185
|
+
# Print the contents of the doc
|
186
|
+
reader = PDF::Reader.new(@file)
|
187
|
+
reader.pages.each do |page|
|
188
|
+
puts page.text
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
166
192
|
else
|
167
193
|
puts "WARNING: No valid OCR engine was defined."
|
168
194
|
end
|
@@ -180,15 +206,16 @@ module Paperless
|
|
180
206
|
if service
|
181
207
|
self.print
|
182
208
|
|
183
|
-
destination = @destination.nil? ? @default_destination
|
184
|
-
|
209
|
+
destination = @destination.nil? ? @default_destination : @destination
|
210
|
+
title = @title.nil? ? File.basename(@file, File.extname(@file)) : @title
|
211
|
+
|
185
212
|
service.create({
|
186
213
|
:delete => options[:delete],
|
187
214
|
:destination => destination,
|
188
215
|
:text_ext => @text_ext,
|
189
216
|
:file => @file,
|
190
217
|
:date => @date,
|
191
|
-
:title =>
|
218
|
+
:title => title,
|
192
219
|
:tags => @tags
|
193
220
|
})
|
194
221
|
else
|
@@ -198,7 +225,7 @@ module Paperless
|
|
198
225
|
|
199
226
|
def print
|
200
227
|
service = @service.nil? ? @default_service : @service
|
201
|
-
title = @title.nil? ? File.basename(@file) : @title
|
228
|
+
title = @title.nil? ? File.basename(@file, File.extname(@file)) : @title
|
202
229
|
|
203
230
|
destination = @destination.nil? ? @default_destination : @destination
|
204
231
|
if destination == PaperlessService::Finder::NO_MOVE && service == PaperlessService::FINDER.downcase
|
data/lib/paperless/ocr_engines/pdfpen6.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'appscript'
|
2
|
+
include Appscript
|
3
|
+
|
4
|
+
module PaperlessOCR
|
5
|
+
|
6
|
+
PDFPEN6 = 'PDFpen 6.app'
|
7
|
+
|
8
|
+
class PDFpen6
|
9
|
+
def initialize
|
10
|
+
@engine = PaperlessOCR::PDFPEN6
|
11
|
+
@app = app(@engine)
|
12
|
+
@app.activate
|
13
|
+
end
|
14
|
+
|
15
|
+
def ocr(options)
|
16
|
+
begin
|
17
|
+
doc = @app.open MacTypes::Alias.path(options[:file])
|
18
|
+
doc.ocr
|
19
|
+
|
20
|
+
app("System Events").processes['PDFpen 6'].visible.set(false)
|
21
|
+
|
22
|
+
while doc.performing_ocr.get
|
23
|
+
sleep 1
|
24
|
+
end
|
25
|
+
doc.close(:saving => :yes)
|
26
|
+
sleep 3
|
27
|
+
rescue
|
28
|
+
puts "WARNING: There was an error OCRing the document with #{@engine}: #{$!}"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
data/lib/paperless/ocr_engines/pdfpenpro6.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'appscript'
|
2
|
+
include Appscript
|
3
|
+
|
4
|
+
module PaperlessOCR
|
5
|
+
|
6
|
+
PDFPENPRO6 = 'PDFpenPro 6.app'
|
7
|
+
|
8
|
+
class PDFpenPro6
|
9
|
+
def initialize
|
10
|
+
@engine = PaperlessOCR::PDFPENPRO6
|
11
|
+
@app = app(@engine)
|
12
|
+
@app.activate
|
13
|
+
end
|
14
|
+
|
15
|
+
def ocr(options)
|
16
|
+
begin
|
17
|
+
doc = @app.open MacTypes::Alias.path(options[:file])
|
18
|
+
doc.ocr
|
19
|
+
|
20
|
+
app("System Events").processes['PDFpenPro 6'].visible.set(false)
|
21
|
+
|
22
|
+
while doc.performing_ocr.get
|
23
|
+
sleep 1
|
24
|
+
end
|
25
|
+
doc.close(:saving => :yes)
|
26
|
+
sleep 3
|
27
|
+
rescue
|
28
|
+
puts "WARNING: There was an error OCRing the document with #{@engine}: #{$!}"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
data/lib/paperless/rule.rb
CHANGED
@@ -14,6 +14,7 @@ module Paperless
|
|
14
14
|
@description = options['description']
|
15
15
|
@tags = options['tags'].nil? ? Array.new : options['tags'].split
|
16
16
|
@date_stamp = DateTime.now
|
17
|
+
@filename = ''
|
17
18
|
@date_default_format = '%Y-%m-%d'
|
18
19
|
@matched = false
|
19
20
|
end
|
@@ -28,6 +29,9 @@ module Paperless
|
|
28
29
|
def match(file,text)
|
29
30
|
return @matched if @matched
|
30
31
|
|
32
|
+
file_ext = File.extname(file)
|
33
|
+
@filename = File.basename(file, file_ext)
|
34
|
+
|
31
35
|
if @condition == Paperless::DATE_VAR
|
32
36
|
@date = date
|
33
37
|
@matched = true
|
@@ -57,6 +61,7 @@ module Paperless
|
|
57
61
|
|
58
62
|
def sub_var(attribute, value)
|
59
63
|
unless attribute.nil?
|
64
|
+
attribute.gsub!(/#{Paperless::FILENAME_VAR}/, @filename)
|
60
65
|
attribute.gsub!(/#{Paperless::MATCH_VAR}/, value)
|
61
66
|
attribute.gsub!(/#{Paperless::DATE_VAR}/, @date_stamp.strftime(@date_default_format))
|
62
67
|
|
data/lib/paperless/services/evernote.rb
CHANGED
@@ -24,17 +24,28 @@ module PaperlessService
|
|
24
24
|
text_ext = options[:text_ext]
|
25
25
|
|
26
26
|
create_options = { :created => date }
|
27
|
-
file_ext = File.extname(from_file
|
27
|
+
file_ext = File.extname(from_file)
|
28
|
+
file_dir = File.dirname(from_file)
|
29
|
+
file_name = File.basename(from_file)
|
28
30
|
|
29
|
-
if
|
31
|
+
if file_name != title
|
32
|
+
new_filename = File.join(file_dir, title + file_ext)
|
33
|
+
File.rename(from_file, new_filename)
|
34
|
+
from_file = new_filename
|
35
|
+
end
|
36
|
+
|
37
|
+
if text_ext.index file_ext.gsub!(/\./,'')
|
38
|
+
puts "Adding text note into Evernote"
|
30
39
|
create_options[:with_text] = File.open(from_file, "rb") {|io| io.read}
|
31
40
|
else
|
32
41
|
if file_ext.match(/md$/i)
|
33
42
|
# If this is a mardown file insert it into Evernote as html
|
43
|
+
puts "Converting Markdown to HTML"
|
34
44
|
text = File.open(from_file, "rb") {|io| io.read}
|
35
45
|
create_options[:with_html] = Markdown.new(text).to_html
|
36
46
|
else
|
37
47
|
# Create a note from a file and let Evernote choose how to attach the file
|
48
|
+
puts "Adding note into Evernote"
|
38
49
|
create_options[:from_file] = MacTypes::FileURL.path(from_file)
|
39
50
|
end
|
40
51
|
end
|
data/lib/paperless/services/finder.rb
CHANGED
@@ -18,28 +18,34 @@ module PaperlessService
|
|
18
18
|
destination = options[:destination]
|
19
19
|
date = options[:date]
|
20
20
|
from_file = options[:file]
|
21
|
-
title = options[:title]
|
21
|
+
title = options[:title] || File.basename(from_file, File.extname(from_file))
|
22
22
|
tags = options[:tags].collect!{|x| x="'#{x}'"} # Add quotes around each tag in case there is a space
|
23
23
|
|
24
24
|
if destination == NO_MOVE || destination == File.dirname(from_file)
|
25
25
|
new_filename = File.join(File.dirname(from_file), title + File.extname(from_file))
|
26
|
+
puts "New filename (1): #{new_filename}"
|
26
27
|
else
|
27
28
|
FileUtils.mkdir_p destination unless File.exists?(destination)
|
28
29
|
new_filename = File.join(destination, title + File.extname(from_file))
|
30
|
+
puts "New filename (2): #{new_filename}"
|
29
31
|
end
|
30
32
|
|
31
|
-
|
33
|
+
puts "Copying File..."
|
34
|
+
FileUtils.cp from_file, new_filename, :verbose => true
|
32
35
|
|
33
36
|
time = Time.new(date.year, date.month, date.day)
|
37
|
+
puts "Modifying the time of the file to be #{time.to_s}"
|
34
38
|
FileUtils.touch new_filename, {:mtime => time}
|
35
39
|
|
36
40
|
if tags.length > 0
|
37
41
|
# Add open meta tags to file
|
42
|
+
puts "Tagging file"
|
38
43
|
system("#{OPENMETA} -p '#{new_filename}' -a #{tags.join(' ')}")
|
39
44
|
end
|
40
45
|
|
41
46
|
if options[:delete] && from_file != new_filename
|
42
|
-
|
47
|
+
puts "Removing original file"
|
48
|
+
FileUtils.rm from_file, :force => true, :verbose => true
|
43
49
|
end
|
44
50
|
end
|
45
51
|
|
data/lib/paperless/version.rb
CHANGED
data/lib/paperless.rb
CHANGED
@@ -6,6 +6,8 @@ require 'paperless/services/evernote.rb'
|
|
6
6
|
require 'paperless/services/devonthinkpro.rb'
|
7
7
|
require 'paperless/services/finder.rb'
|
8
8
|
require 'paperless/ocr_engines/acrobat.rb'
|
9
|
+
require 'paperless/ocr_engines/pdfpen6.rb'
|
10
|
+
require 'paperless/ocr_engines/pdfpenpro6.rb'
|
9
11
|
require 'paperless/ocr_engines/pdfpen.rb'
|
10
12
|
require 'paperless/ocr_engines/pdfpenpro.rb'
|
11
13
|
require 'paperless/ocr_engines/devonthinkpro.rb'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: paperless
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -190,6 +190,8 @@ files:
|
|
190
190
|
- lib/paperless/services/finder.rb
|
191
191
|
- lib/paperless/services/devonthinkpro.rb
|
192
192
|
- lib/paperless/ocr_engines/acrobat.rb
|
193
|
+
- lib/paperless/ocr_engines/pdfpen6.rb
|
194
|
+
- lib/paperless/ocr_engines/pdfpenpro6.rb
|
193
195
|
- lib/paperless/ocr_engines/pdfpen.rb
|
194
196
|
- lib/paperless/ocr_engines/pdfpenpro.rb
|
195
197
|
- lib/paperless/ocr_engines/devonthinkpro.rb
|
@@ -222,7 +224,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
222
224
|
version: '0'
|
223
225
|
segments:
|
224
226
|
- 0
|
225
|
-
hash:
|
227
|
+
hash: -3000972400223417895
|
226
228
|
requirements: []
|
227
229
|
rubyforge_project:
|
228
230
|
rubygems_version: 1.8.24
|