pdfmd 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/pdfmd.rb +853 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0e9a319bb39e3972119dabeda67a88918e1662a9
|
4
|
+
data.tar.gz: 2ddb7e4e715fe65192685c19bcdb560b36f3708b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e88256a30ab208960bf09071e88ec291c98349fc8b5dd66077867182b1a467bef916e2e6524e3371f95237863dd8e7e957462f52dcb9dc3a05b6bc172326d7ec
|
7
|
+
data.tar.gz: 0ae519f568c6409c249e5a154365c70a4c06dcfac3288ba863d336d28c2b4daf0188a4d07d0fa7c7fbf5b34bcfd16e5fe930b28759aa9c9f36db914abfe556ec
|
data/lib/pdfmd.rb
ADDED
@@ -0,0 +1,853 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# == Version 1.3
|
3
|
+
#
|
4
|
+
# == File: pdfmetadata.rb
|
5
|
+
#
|
6
|
+
# Show and edit Metadata of PDF files and rename the files accordingly.
|
7
|
+
#
|
8
|
+
# === Requirements
|
9
|
+
#
|
10
|
+
# ==== Ruby gems:
|
11
|
+
# - thor
|
12
|
+
# - highline/import
|
13
|
+
# - fileutils
|
14
|
+
# - i18n
|
15
|
+
# - pathname
|
16
|
+
# - logger
|
17
|
+
#
|
18
|
+
# ==== OS applications:
|
19
|
+
#
|
20
|
+
# - exiftool
|
21
|
+
#
|
22
|
+
# === Usage
|
23
|
+
#
|
24
|
+
# $ ./pdfmetadata <action> <parameter> file
|
25
|
+
#
|
26
|
+
# $ ./pdfmetadata help <action>
|
27
|
+
#
|
28
|
+
# An overview about the actions can be seen when running the script without
|
29
|
+
# any parameters
|
30
|
+
#
|
31
|
+
# === Changelog
|
32
|
+
#
|
33
|
+
# Version 1.3
|
34
|
+
# - Small bugfix about special characters in filenames (author).
|
35
|
+
# - Bugfix for the tag 'createdate' written as 'CreateDate' which did not
|
36
|
+
# take the date then.
|
37
|
+
# - Removed inactive code.
|
38
|
+
# - Added parameter 'version'
|
39
|
+
#
|
40
|
+
# Version 1.2
|
41
|
+
# - Small bugfix with the sort function and the logfile being created.
|
42
|
+
#
|
43
|
+
# Version 1.1
|
44
|
+
# - Added Function to sort pdf documents into a directory structure based on
|
45
|
+
# the author of the document.
|
46
|
+
# - Added dependency 'pathname'
|
47
|
+
# - Added dependency 'logger'
|
48
|
+
# - Added dependency 'i18n'
|
49
|
+
# - Added method 'sort'
|
50
|
+
# - Changing a tag will now output the old value in the edit dialog.
|
51
|
+
# - Updated documentation and descriptions of methods
|
52
|
+
#
|
53
|
+
# Version 1.0
|
54
|
+
# - Added documentation in long description of the commands
|
55
|
+
# - Added method "explain" for further information
|
56
|
+
#
|
57
|
+
# Version 0.9
|
58
|
+
# - Added 'rename' option to edit metatags
|
59
|
+
# - Fixed some output strings
|
60
|
+
#
|
61
|
+
# Version 0.x
|
62
|
+
# - All other stuff
|
63
|
+
#
|
64
|
+
# Check and set metadata of PDF documents
|
65
|
+
#
|
66
|
+
# A complete set of metadata contains
|
67
|
+
#
|
68
|
+
# * CreateDate
|
69
|
+
# * Title
|
70
|
+
# * Author
|
71
|
+
# * Subject
|
72
|
+
# * Keywords (optional)
|
73
|
+
#
|
74
|
+
# TODO: Include password protected PDF documents as well
|
75
|
+
# TODO: Fix broken PDF files automatically
|
76
|
+
# TODO: Enable logging in more functions than only "sort"
|
77
|
+
# TODO: Read this: http://lostechies.com/derickbailey/2011/04/29/writing-a-thor-application/
|
78
|
+
# TODO: ... and this: http://blog.paracode.com/2012/05/17/building-your-tools-with-thor/
|
79
|
+
# TODO: Create Gem: http://yehudakatz.com/2010/04/02/using-gemspecs-as-intended/
|
80
|
+
# gs \
|
81
|
+
# -o repaired.pdf \
|
82
|
+
# -sDEVICE=pdfwrite \
|
83
|
+
# -dPDFSETTINGS=/prepress \
|
84
|
+
# corrupted.pdf
|
85
|
+
#
|
86
|
+
# == Author
|
87
|
+
#
|
88
|
+
# Daniel Roos <daniel-git@micronarrativ.org>
|
89
|
+
# Source: https://github.com/Micronarrativ/micronarrativ/tree/scripts
|
90
|
+
#
|
91
|
+
require "thor"
|
92
|
+
require "highline/import"
|
93
|
+
require "fileutils"
|
94
|
+
require "i18n"
|
95
|
+
require 'pathname'
|
96
|
+
require 'logger'
|
97
|
+
|
98
|
+
VERSION = '1.3'
|
99
|
+
#
|
100
|
+
# Function to read the metadata from a given file
|
101
|
+
# hash readMetadata(string)
|
102
|
+
#
|
103
|
+
def readMetadata(pathFile = false)
|
104
|
+
metadata = Hash.new
|
105
|
+
metadata['keywords'] = ''
|
106
|
+
metadata['subject'] = ''
|
107
|
+
metadata['title'] = ''
|
108
|
+
metadata['author'] = ''
|
109
|
+
metadata['creator'] = ''
|
110
|
+
metadata['createdate'] = ''
|
111
|
+
if not File.file?(pathFile)
|
112
|
+
puts "Cannot access file #{pathFile}. Abort"
|
113
|
+
abort
|
114
|
+
end
|
115
|
+
|
116
|
+
# Fetch the Metada with the help of exiftools (unless something better is
|
117
|
+
# found
|
118
|
+
metaStrings = `exiftool '#{pathFile}' | egrep -i '^Creator\s+\:|^Author|Create Date|Subject|Keywords|Title'`
|
119
|
+
|
120
|
+
# Time to cherrypick the available data
|
121
|
+
entries = metaStrings.split("\n")
|
122
|
+
entries.each do |entry|
|
123
|
+
values = entry.split(" : ")
|
124
|
+
values[0].match(/Creator/) and metadata['creator'] == '' ? metadata['creator'] = values[1]: metadata['creator'] = ''
|
125
|
+
values[0].match(/Author/) and metadata['author'] == '' ? metadata['author'] = values[1]: metadata['author'] = ''
|
126
|
+
values[0].match(/Create Date/) and metadata['createdate'] == '' ? metadata['createdate'] = values[1]: metadata['createdate'] = ''
|
127
|
+
values[0].match(/Subject/) and metadata['subject'] == '' ? metadata['subject'] = values[1]: metadata['subject'] = ''
|
128
|
+
values[0].match(/Keywords/) and metadata['keywords'] == '' ? metadata['keywords'] = values[1]: metadata['keywords'] =''
|
129
|
+
values[0].match(/Title/) and metadata['title'] == '' ? metadata['title'] = values[1]: metadata['title'] =''
|
130
|
+
end
|
131
|
+
return metadata
|
132
|
+
end
|
133
|
+
|
134
|
+
#
|
135
|
+
# Set Keywords Preface based on title and subject
|
136
|
+
# If subject matches a number/character combination and contains no spaces,
|
137
|
+
# the preface will be combined with the doktype.
|
138
|
+
# If not: preface will contain the whole subject with dots and spaces being
|
139
|
+
# replaced with underscores
|
140
|
+
#
|
141
|
+
def setKeywordsPreface(metadata, doktype)
|
142
|
+
if metadata['subject'].match(/^\d+[^+s]+.*/)
|
143
|
+
return doktype + metadata['subject']
|
144
|
+
else
|
145
|
+
subject = metadata['subject']
|
146
|
+
|
147
|
+
# Take care of special characters
|
148
|
+
I18n.enforce_available_locales = false
|
149
|
+
subject = I18n.transliterate(metadata['subject'])
|
150
|
+
|
151
|
+
# Replace everything else
|
152
|
+
subject = subject.gsub(/[^a-zA-Z0-9]+/,'_')
|
153
|
+
return subject
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
#
|
158
|
+
# Read user input
|
159
|
+
#
|
160
|
+
def readUserInput(textstring = 'Enter value: ')
|
161
|
+
return ask textstring
|
162
|
+
end
|
163
|
+
|
164
|
+
#
|
165
|
+
# Identify a date
|
166
|
+
# Function takes a string and tries to identify a date in there.
|
167
|
+
# returns false if no date could be identified
|
168
|
+
# otherwise the date is returned in the format as
|
169
|
+
#
|
170
|
+
# YYYY:MM:DD HH:mm:ss
|
171
|
+
#
|
172
|
+
# For missing time values zero is assumed
|
173
|
+
#
|
174
|
+
def identifyDate(datestring)
|
175
|
+
identifiedDate = ''
|
176
|
+
year = '[1-2][90][0-9][0-9]'
|
177
|
+
month = '0[0-9]|10|11|12'
|
178
|
+
day = '[1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1]'
|
179
|
+
hour = '[0-1][0-9]|2[0-3]|[1-9]'
|
180
|
+
minute = '[0-5][0-9]'
|
181
|
+
second = '[0-5][0-9]'
|
182
|
+
case datestring
|
183
|
+
when /^(#{year})(#{month})(#{day})$/
|
184
|
+
identifiedDate = $1 + ':' + $2 + ':' + $3 + ' 00:00:00'
|
185
|
+
when /^(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})$/
|
186
|
+
identifiedDate = $1 + ':' + $2 + ':' + $3 + ' ' + $4 + ':' + $5 + ':' + $6
|
187
|
+
when /^(#{year})[\:|\.|\-](#{month})[\:|\.|\-](#{day})\s(#{hour})[\:](#{minute})[\:](#{second})$/
|
188
|
+
identifiedDate = $1 + ':' + $2 + ':' + $3 + ' ' + $4 + ':' + $5 + ':' + $6
|
189
|
+
when /^(#{year})[\:|\.|\-](#{month})[\:|\.|\-](#{day})$/
|
190
|
+
day = "%02d" % $3
|
191
|
+
month = "%02d" % $2
|
192
|
+
identifiedDate = $1 + ':' + month + ':' + day + ' 00:00:00'
|
193
|
+
else
|
194
|
+
identifiedDate = false
|
195
|
+
end
|
196
|
+
return identifiedDate
|
197
|
+
end
|
198
|
+
|
199
|
+
class DOC < Thor
|
200
|
+
|
201
|
+
|
202
|
+
#
|
203
|
+
# Show the current metadata tags
|
204
|
+
#
|
205
|
+
# TODO: format output as JSON and YAML
|
206
|
+
# TODO: Enable additional options
|
207
|
+
#
|
208
|
+
desc 'show', 'Show metadata of a file'
|
209
|
+
method_option :all, :type => :boolean, :aliases => '-a', :desc => 'Show all metatags', :default => false, :required => false
|
210
|
+
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Show specific tag(s), comma separated', :required => false
|
211
|
+
long_desc <<-LONGDESC
|
212
|
+
== General
|
213
|
+
|
214
|
+
Show metatags of a PDF document.
|
215
|
+
|
216
|
+
The following tags are being shown:
|
217
|
+
\x5 * Author
|
218
|
+
\x5 * Creator
|
219
|
+
\x5 * CreateDate
|
220
|
+
\x5 * Title
|
221
|
+
\x5 * Subject
|
222
|
+
\x5 * Keywords
|
223
|
+
|
224
|
+
== Parameters
|
225
|
+
|
226
|
+
--all, -a
|
227
|
+
\x5 Show all relevant metatags for a document.
|
228
|
+
|
229
|
+
Relevant tags are Author,Creator, CreateDate, Title, Subject, Keywords.
|
230
|
+
|
231
|
+
--tag, -t
|
232
|
+
\x5 Specify the metatag to show. The selected metatag must be one of the relevant tags. Other tags are ignored and nothing is returned.
|
233
|
+
|
234
|
+
== Example
|
235
|
+
|
236
|
+
# Show default metatags for a pdf document
|
237
|
+
\x5>CLI show <filename>
|
238
|
+
|
239
|
+
# Show default metatags for example.pdf
|
240
|
+
\x5>CLI show example.pdf
|
241
|
+
|
242
|
+
# Show value for metatag 'Author' for the file example.pdf
|
243
|
+
\x5>CLI show -t author example.pdf
|
244
|
+
|
245
|
+
# Show value for metatags 'Author','Title' for the file example.pdf
|
246
|
+
\x5>CLI show -t author,title example.pdf
|
247
|
+
|
248
|
+
LONGDESC
|
249
|
+
def show(filename)
|
250
|
+
metadata = readMetadata(filename)
|
251
|
+
|
252
|
+
# Output all metatags
|
253
|
+
if options[:all] or options[:tag].nil?
|
254
|
+
puts "Author : " + metadata['author'].to_s
|
255
|
+
puts "Creator : " + metadata['creator'].to_s
|
256
|
+
puts "CreateDate : " + metadata['createdate'].to_s
|
257
|
+
puts "Subject : " + metadata['subject'].to_s
|
258
|
+
puts "Title : " + metadata['title'].to_s
|
259
|
+
puts "Keywords : " + metadata['keywords'].to_s
|
260
|
+
|
261
|
+
# Ouput only specific tags
|
262
|
+
elsif not options[:tag].nil?
|
263
|
+
tags = options[:tag].split(',')
|
264
|
+
tags.each do |tag|
|
265
|
+
puts metadata[tag]
|
266
|
+
end
|
267
|
+
end
|
268
|
+
|
269
|
+
end
|
270
|
+
|
271
|
+
#
|
272
|
+
# Change a MetaTag Attribute
|
273
|
+
#
|
274
|
+
# TODO: keywords are added differently according to the documentation
|
275
|
+
# http://www.sno.phy.queensu.ca/~phil/exiftool/faq.html
|
276
|
+
desc 'edit', 'Edit Meta Tag(s)'
|
277
|
+
long_desc <<-LONGDESC
|
278
|
+
== General
|
279
|
+
|
280
|
+
Command will edit the metadata of a PDF document. Multiple values can be
|
281
|
+
specified or 'all'.
|
282
|
+
|
283
|
+
The command will invoke an interactive user input and request the values
|
284
|
+
for the metatag.
|
285
|
+
|
286
|
+
Additionally the file can be renamed at the end according to the new meta
|
287
|
+
tags. See `$ #{__FILE__} help rename` for details.
|
288
|
+
|
289
|
+
== Parameters
|
290
|
+
|
291
|
+
--tag, -t
|
292
|
+
\x5 Names or list of names of Metatag fields to set, separated by commata.
|
293
|
+
|
294
|
+
--rename, -r
|
295
|
+
\x5 Rename file after updating the meta tag information according to the fields.
|
296
|
+
|
297
|
+
This parameter is identical to running `> CLI rename <filename>`
|
298
|
+
|
299
|
+
General example:
|
300
|
+
|
301
|
+
# Edit tag 'TAG' and set a new value interactive.
|
302
|
+
\x5>CLI edit -t TAG <filename>
|
303
|
+
|
304
|
+
# Edit tag 'Author' and set new value interactive.
|
305
|
+
\x5>CLI edit -t author example.pdf
|
306
|
+
|
307
|
+
# Edit mulitple Tags and set a new value.
|
308
|
+
\x5>CLI edit -t tag1,tag2,tag3 <filename>
|
309
|
+
|
310
|
+
|
311
|
+
== Multiple Tags
|
312
|
+
|
313
|
+
For setting multiple tags list the tags comma separated.
|
314
|
+
|
315
|
+
For setting all tags (Author, Title, Subject, CreateDate, Keywords) use the keyword 'all' as tagname.
|
316
|
+
|
317
|
+
# Set tags 'Author', 'Title', 'Subject' in example.pdf interactivly.
|
318
|
+
\x5>CLI edit -t author,title,subject example.pdf`
|
319
|
+
|
320
|
+
# Set tags 'Author', 'Title', 'Subject', 'CreateDate', 'Keywords' in
|
321
|
+
example.pdf interactive.
|
322
|
+
\x5>CLI edit -t all example.pdf
|
323
|
+
|
324
|
+
== Tag: CreateDate
|
325
|
+
|
326
|
+
In order to enter a value for the 'CreateDate' field, some internal matching is going on in order to make it easier and faster to enter dates and times.
|
327
|
+
|
328
|
+
The following formats are identified/matched:
|
329
|
+
|
330
|
+
\x5 yyyymmdd
|
331
|
+
\x5 yyyymmd
|
332
|
+
\x5 yyyymmddHHMMSS
|
333
|
+
\x5 yyyy-mm-dd HH:MM:SS
|
334
|
+
\x5 yyyy:mm:dd HH:MM:SS
|
335
|
+
\x5 yyyy.mm.dd HH:MM:SS
|
336
|
+
\x5 yyyy-mm-d
|
337
|
+
\x5 yyyy-mm-dd
|
338
|
+
\x5 yyyy.mm.d
|
339
|
+
\x5 yyyy.mm.dd
|
340
|
+
\x5 yyyy:mm:d
|
341
|
+
\x5 yyyy:mm:dd
|
342
|
+
|
343
|
+
\x5 - If HH:MM:SS or HHMMSS is not provided, those values are automatically set to zero.
|
344
|
+
\x5 - The output format of every timestamp is <yyyy:mm:dd HH:MM:SS>
|
345
|
+
\x5 - When providing and invalid date, the incorrect date is rejected and the user asked to provide the correct date.
|
346
|
+
|
347
|
+
== Rename file
|
348
|
+
|
349
|
+
In addition to setting the tags the current file can be renamed according to
|
350
|
+
the new metadata.
|
351
|
+
|
352
|
+
# Set tag 'Author' and rename file example.pdf
|
353
|
+
\x5> CLI edit -t author -r example.pdf
|
354
|
+
|
355
|
+
See `> CLI help rename` for details about renaming.
|
356
|
+
|
357
|
+
LONGDESC
|
358
|
+
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Name of the Tag(s) to Edit', :default => false, :required => true
|
359
|
+
method_option :rename, :type => :boolean, :aliases => '-r', :desc => 'Rename file after changing meta-tags', :default => false, :required => false
|
360
|
+
def edit(filename)
|
361
|
+
metadata = readMetadata(filename)
|
362
|
+
|
363
|
+
if options[:tag] == 'all'
|
364
|
+
tags = ['author','title','subject','createdate','keywords']
|
365
|
+
else
|
366
|
+
tags = options[:tag].split(',')
|
367
|
+
end
|
368
|
+
tags.each do |currentTag|
|
369
|
+
|
370
|
+
# Change the tag to something we can use here
|
371
|
+
puts "Current value: '#{metadata[currentTag.downcase]}'"
|
372
|
+
answer = readUserInput("Enter new value for #{currentTag} :")
|
373
|
+
if currentTag.downcase == 'createdate'
|
374
|
+
while not answer = identifyDate(answer)
|
375
|
+
puts 'Invalid date format'
|
376
|
+
answer = readUserInput("Enter new value for #{currentTag} :")
|
377
|
+
end
|
378
|
+
end
|
379
|
+
puts "Changing value for #{currentTag}: '#{metadata[currentTag]}' => #{answer}"
|
380
|
+
`exiftool -#{currentTag.downcase}='#{answer}' -overwrite_original '#{filename}'`
|
381
|
+
end
|
382
|
+
|
383
|
+
#
|
384
|
+
# If required, run the renaming task afterwards
|
385
|
+
# This is not pretty, but seems to be the only way to do this in THOR
|
386
|
+
#
|
387
|
+
if options[:rename]
|
388
|
+
puts `#{__FILE__} rename '#{filename}'`
|
389
|
+
end
|
390
|
+
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Check the metadata for the minimum necessary tags
|
395
|
+
# See documentation at the top of this file for details
|
396
|
+
#
|
397
|
+
# void check(string)
|
398
|
+
desc 'check', 'Check Metadata for completeness'
|
399
|
+
long_desc <<-LONGDESC
|
400
|
+
== General
|
401
|
+
|
402
|
+
Show value of the following metatags of a PDF document:
|
403
|
+
|
404
|
+
- Author
|
405
|
+
\x5- Creator
|
406
|
+
\x5- CreateDate
|
407
|
+
\x5- Subject
|
408
|
+
\x5- Title
|
409
|
+
\x5- Keywords
|
410
|
+
|
411
|
+
== Example
|
412
|
+
|
413
|
+
# Show the values of the metatags for example.pdf
|
414
|
+
\x5>CLI show example.pdf
|
415
|
+
|
416
|
+
LONGDESC
|
417
|
+
def check(filename)
|
418
|
+
returnvalue = 0
|
419
|
+
readMetadata(filename).each do|key,value|
|
420
|
+
if key.match(/author|subject|createdate|title/) and value.empty?
|
421
|
+
puts 'Missing value: ' + key
|
422
|
+
returnvalue == 0 ? returnvalue = 1 : ''
|
423
|
+
end
|
424
|
+
end
|
425
|
+
exit returnvalue
|
426
|
+
end
|
427
|
+
|
428
|
+
#
|
429
|
+
# Explain fields and Metatags
|
430
|
+
# Show information about how they are used.
|
431
|
+
#
|
432
|
+
desc 'explain','Show more information about usuable Meta-Tags'
|
433
|
+
long_desc <<-LONGDESC
|
434
|
+
== General
|
435
|
+
|
436
|
+
Explain some terms used with the script.
|
437
|
+
|
438
|
+
== Example
|
439
|
+
|
440
|
+
# Show the available subjects
|
441
|
+
\x5>CLI explain
|
442
|
+
|
443
|
+
# Show information about the subject 'author'
|
444
|
+
\x5>CLI explain author
|
445
|
+
|
446
|
+
LONGDESC
|
447
|
+
def explain(term='')
|
448
|
+
|
449
|
+
case term
|
450
|
+
when ''
|
451
|
+
puts 'Available subjects:'
|
452
|
+
puts '- author'
|
453
|
+
puts '- createdate'
|
454
|
+
puts '- keywords'
|
455
|
+
puts '- subject'
|
456
|
+
puts '- title'
|
457
|
+
puts ' '
|
458
|
+
puts "Run `$ #{__FILE__} explain <subject>` to get more details."
|
459
|
+
when 'author'
|
460
|
+
puts '[Author]'
|
461
|
+
puts ' The sender or creator of the document.'
|
462
|
+
when 'createdate'
|
463
|
+
puts '[CreateDate]'
|
464
|
+
puts ' Date of the document. This is not the date when the file was created, but'
|
465
|
+
puts ' the date found in the document or printed on the document.'
|
466
|
+
when 'title'
|
467
|
+
puts '[Title]'
|
468
|
+
puts ' General type of the document, e.g. Manual, Invoice.'
|
469
|
+
when 'subject'
|
470
|
+
puts '[Subject]'
|
471
|
+
puts ' What is the document about.'
|
472
|
+
puts ' For example:'
|
473
|
+
puts ' Manual: What is the manual about?'
|
474
|
+
puts ' Invoice: Invoice number?'
|
475
|
+
puts ' Contract: Contract number of Subject of the contract?'
|
476
|
+
puts ' Order: Ordernumber of the document?'
|
477
|
+
when 'keywords'
|
478
|
+
puts '[Keywords]'
|
479
|
+
puts ' Anything else that might be of interesst.'
|
480
|
+
puts ' In Orders the elements that have been orders. Contracts might contain the'
|
481
|
+
puts ' Names and adress of the involved parties.'
|
482
|
+
puts ' '
|
483
|
+
puts ' When writing Invoices with their numbers, these will be automatically be '
|
484
|
+
puts ' picked up and can be integrated in the filename, e.g. "Invoicenumber 12334'
|
485
|
+
end
|
486
|
+
|
487
|
+
end
|
488
|
+
|
489
|
+
#
|
490
|
+
# Sort the files into directories based on the author
|
491
|
+
#
|
492
|
+
desc 'sort','Sort files into directories sorted by Author'
|
493
|
+
long_desc <<-LONGDESC
|
494
|
+
== General
|
495
|
+
|
496
|
+
Will sort pdf documents into subdirectories according to the value of their
|
497
|
+
tag 'author'.
|
498
|
+
|
499
|
+
When using this action a logfile with all actions will be generated in the
|
500
|
+
current working directory with the same name as the script and the ending
|
501
|
+
'.log'. This can be disabled with the parameter 'log' if required.
|
502
|
+
|
503
|
+
If a document does not have an entry in the meta tag 'author', the file will
|
504
|
+
not be processed. This can be seen in the output of the logfile as well.
|
505
|
+
|
506
|
+
=== Parameters
|
507
|
+
|
508
|
+
[*destination|d*]
|
509
|
+
\x5 Speficy the root output directory to where the folderstructure is being created.
|
510
|
+
|
511
|
+
This parameter is required.
|
512
|
+
|
513
|
+
[*copy|c*]
|
514
|
+
\x5 Copy the files instead of moving them.
|
515
|
+
|
516
|
+
[*log|l*]
|
517
|
+
\x5 Disable/Enable the logging.
|
518
|
+
\x5 Default: enabled.
|
519
|
+
|
520
|
+
=== Replacement rules
|
521
|
+
|
522
|
+
The subdirectories for the documents are generated from the values in the
|
523
|
+
tag 'author' of each document.
|
524
|
+
|
525
|
+
In order to ensure a clean directory structure, there are certain rules
|
526
|
+
for altering the values.
|
527
|
+
\x5 1. Whitespaces are replaced by underscores.
|
528
|
+
\x5 2. Dots are replaced by underscores.
|
529
|
+
\x5 3. All letters are converted to their lowercase version.
|
530
|
+
\x5 4. Special characters are serialized
|
531
|
+
|
532
|
+
=== Example
|
533
|
+
|
534
|
+
This command does the following:
|
535
|
+
\x5 1. Take all pdf documents in the subdirectory ./documents.
|
536
|
+
\x5 2. Create the output folder structure in `/tmp/test/`.
|
537
|
+
\x5 3. Copy the files instead of moving them.
|
538
|
+
\x5 4. Disable the logging.
|
539
|
+
\x5> CLI sort -d /tmp/test -c -l false ./documents
|
540
|
+
|
541
|
+
LONGDESC
|
542
|
+
method_option :destination, :aliases => '-d', :required => true, :type => :string, :desc => 'Defines the output directory'
|
543
|
+
method_option :copy, :aliases => '-c', :required => false, :type => :boolean, :desc => 'Copy files instead of moving them'
|
544
|
+
method_option :log, :aliases => '-l', :require => false, :type => :boolean, :desc => 'Enable/Disable creation of log files', :default => true
|
545
|
+
def sort(inputDir = '.')
|
546
|
+
|
547
|
+
destination = options[:destination]
|
548
|
+
logenable = options[:log]
|
549
|
+
scriptname = Pathname.new(__FILE__).basename
|
550
|
+
logenable ? $logger = Logger.new(Dir.pwd.chomp('/') + "/#{scriptname}.log") : ''
|
551
|
+
|
552
|
+
# Input validation
|
553
|
+
!File.exist?(inputDir) ? abort('Input directory does not exist. Abort.'): ''
|
554
|
+
File.directory?(inputDir) ? '' : abort('Input is a single file')
|
555
|
+
File.file?(destination) ? abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort") : ''
|
556
|
+
unless File.directory?(destination)
|
557
|
+
FileUtils.mkdir_p(destination)
|
558
|
+
$logger.info("Destination '#{destination}' has been created.")
|
559
|
+
end
|
560
|
+
|
561
|
+
# Iterate through all files
|
562
|
+
Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
|
563
|
+
|
564
|
+
metadata = readMetadata(file)
|
565
|
+
if metadata['author'] and not metadata['author'].empty?
|
566
|
+
author = metadata['author'].gsub(' ','_').gsub('.','_')
|
567
|
+
I18n.enforce_available_locales = false # Serialize special characters
|
568
|
+
author = I18n.transliterate(author).downcase
|
569
|
+
folderdestination = destination.chomp('/') + '/' + author
|
570
|
+
unless File.directory?(folderdestination)
|
571
|
+
FileUtils.mkdir_p(folderdestination)
|
572
|
+
logenable ? $logger.info("Folder '#{folderdestination}' has been created."): ''
|
573
|
+
end
|
574
|
+
filedestination = destination.chomp('/') + '/' + author + '/' + Pathname.new(file).basename.to_s
|
575
|
+
|
576
|
+
# Final check before touching the filesystem
|
577
|
+
if not File.exist?(filedestination)
|
578
|
+
$logger.info("File '#{file}' => '#{filedestination}'")
|
579
|
+
|
580
|
+
# Move/Copy the file
|
581
|
+
if options[:copy]
|
582
|
+
FileUtils.cp(file, filedestination)
|
583
|
+
else
|
584
|
+
FileUtils.mv(file,filedestination)
|
585
|
+
end
|
586
|
+
|
587
|
+
else
|
588
|
+
logenable ? $logger.warn("File '#{filedestination}' already exists. Ignoring.") : ''
|
589
|
+
end
|
590
|
+
else
|
591
|
+
logenable ? $logger.warn("Missing tag 'Author' for file '#{file}'. Skipping.") : (puts "Missing tag 'Author' for file '#{file}'. Skipping")
|
592
|
+
end
|
593
|
+
end
|
594
|
+
|
595
|
+
end
|
596
|
+
|
597
|
+
#
|
598
|
+
# Rename the file according to the Metadata
|
599
|
+
#
|
600
|
+
# Scheme: YYYYMMDD-author-subject-keywords.extension
|
601
|
+
desc 'rename', 'Rename the file according to Metadata'
|
602
|
+
long_desc <<-LONGDESC
|
603
|
+
== General
|
604
|
+
|
605
|
+
Rename a file with the meta tags in the document.
|
606
|
+
|
607
|
+
== Parameter
|
608
|
+
|
609
|
+
--dry-run, -n
|
610
|
+
\x5 Simulate the renaming process and show the result without changing the file.
|
611
|
+
|
612
|
+
--all-keywords, -a
|
613
|
+
\x5 Use all keywords from the meta information in the file name and ignore the limit.
|
614
|
+
|
615
|
+
--keywwords, -k
|
616
|
+
\x5 Set the number of keywords used in the filename to a new value.
|
617
|
+
\x5 Default: 3
|
618
|
+
|
619
|
+
--outputdir, -o
|
620
|
+
\x5 Not implemented yet. Default output dir for the renamed file is the source directory.
|
621
|
+
|
622
|
+
== Example
|
623
|
+
|
624
|
+
# Rename the file according to the metatags
|
625
|
+
\x5> CLI rename <filename>
|
626
|
+
|
627
|
+
# Rename example.pdf according to the metatags
|
628
|
+
\x5> CLI rename example.pdf
|
629
|
+
|
630
|
+
# Simulate renaming example.pdf according to the metatags (dry-run)
|
631
|
+
\x5> CLI rename -n example.pdf
|
632
|
+
|
633
|
+
== Rules
|
634
|
+
|
635
|
+
There are some rules regarding how documents are being renamed
|
636
|
+
|
637
|
+
Rule 1: All documents have the following filenaming structure:
|
638
|
+
|
639
|
+
<yyyymmdd>-<author>-<type>-<additionalInformation>.<extension>
|
640
|
+
|
641
|
+
\x5 # <yyyymmdd>: Year, month and day identival to the meta information in the
|
642
|
+
document.
|
643
|
+
\x5 # <author>: Author of the document, identical to the meta information
|
644
|
+
in the document. Special characters and whitespaces are replaced.
|
645
|
+
\x5 # <type>: Document type, is being generated from the title field in the metadata of the document. Document type is a three character abbreviation following the following logic:
|
646
|
+
|
647
|
+
\x5 til => Tilbudt|Angebot
|
648
|
+
\x5 odb => Orderbekreftelse
|
649
|
+
\x5 fak => Faktura
|
650
|
+
\x5 ord => Order
|
651
|
+
\x5 avt => Kontrakt|Avtale|Vertrag|contract
|
652
|
+
\x5 kvi => Kvittering
|
653
|
+
\x5 man => Manual
|
654
|
+
\x5 bil => Billett|Ticket
|
655
|
+
\x5 inf => Informasjon|Information
|
656
|
+
\x5 dok => unknown
|
657
|
+
|
658
|
+
If the dokument type can not be determined automatically, it defaults to 'dok'.
|
659
|
+
|
660
|
+
# <additionalInformation>: Information generated from the metadata fields
|
661
|
+
'title', 'subject' and 'keywords'.
|
662
|
+
|
663
|
+
If 'Title' or 'Keywords' contains one of the following keywords, the will be replaced with the corresponding abbreviation followed by the specified value separated by a whitespace:
|
664
|
+
|
665
|
+
\x5 fak => Faktura|Fakturanummer|Rechnung|Rechnungsnummer
|
666
|
+
\x5 kdn => Kunde|Kundenummer|Kunde|Kundennummer
|
667
|
+
\x5 ord => Ordre|Ordrenummer|Bestellung|Bestellungsnummer
|
668
|
+
\x5 kvi => Kvittering|Kvitteringsnummer|Quittung|Quittungsnummer
|
669
|
+
|
670
|
+
Rule 2: The number of keywords used in the filename is defined by the parameter '-k'. See the section of that parameter for more details and the default value.
|
671
|
+
|
672
|
+
Rule 3: Keywords matching 'kvi','fak','ord','kdn' are prioritised.
|
673
|
+
|
674
|
+
Rule 4: Special characters and whitespaces are replaced:
|
675
|
+
|
676
|
+
\x5 ' ' => '_'
|
677
|
+
\x5 '/' => '_'
|
678
|
+
|
679
|
+
Rule 5: The new filename has only lowercase characters.
|
680
|
+
|
681
|
+
== Example (detailed)
|
682
|
+
|
683
|
+
# Example PDF with following MetaTags:
|
684
|
+
|
685
|
+
\x5 Filename : example.pdf
|
686
|
+
\x5 Author : John
|
687
|
+
\x5 Subject : new Product
|
688
|
+
\x5 Title : Presentation
|
689
|
+
\x5 CreateDate : 1970:01:01 01:00:00
|
690
|
+
\x5 Keywords : John Doe, Jane Doe, Mister Doe
|
691
|
+
|
692
|
+
# Renaming the file
|
693
|
+
\x5> CLI rename example.pdf
|
694
|
+
\x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
|
695
|
+
|
696
|
+
# Simulation to rename the file (no actual change)
|
697
|
+
\x5> CLI rename -n example.pdf
|
698
|
+
\x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
|
699
|
+
|
700
|
+
# Renaming the file with all keywords
|
701
|
+
\x5> CLI rename -n -a example.pdf
|
702
|
+
\x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe-mister_doe.pdf
|
703
|
+
|
704
|
+
LONGDESC
|
705
|
+
method_option :dryrun, :type => :boolean, :aliases => '-n', :desc => 'Run without making changes', :default => false, :required => false
|
706
|
+
method_option ':all-keywords', :type => :boolean, :aliases => '-a', :desc => 'Add all keywords (no limit)', :default => false, :required => false
|
707
|
+
method_option :keywords, :type => :numeric, :aliases => '-k', :desc => 'Number of keywords to include (Default: 3)', :default => 3, :required => false
|
708
|
+
method_option :outputdir, :aliases => '-o', :type => :string, :desc => 'Speficy output directory', :default => :false, :required => :false
|
709
|
+
def rename(filename)
|
710
|
+
metadata = readMetadata(filename).each do |key,value|
|
711
|
+
|
712
|
+
# Check if the metadata is complete
|
713
|
+
if key.match(/author|subject|createdate|title/) and value.empty?
|
714
|
+
puts 'Missing value for ' + key
|
715
|
+
puts 'Abort'
|
716
|
+
exit 1
|
717
|
+
end
|
718
|
+
|
719
|
+
end
|
720
|
+
|
721
|
+
date = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/,'').gsub(/\:/,'')
|
722
|
+
author = metadata['author'].gsub(/\./,'_').gsub(/\-/,'').gsub(/\s/,'_')
|
723
|
+
I18n.enforce_available_locales = false
|
724
|
+
author = I18n.transliterate(author) # Normalising
|
725
|
+
|
726
|
+
keywords_preface = ''
|
727
|
+
# This statement can probably be optimised
|
728
|
+
case metadata['title']
|
729
|
+
when /(Tilbudt|Angebot)/i
|
730
|
+
doktype = 'til'
|
731
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
732
|
+
when /Orderbekrefelse/i
|
733
|
+
doktype = 'odb'
|
734
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
735
|
+
when /faktura/i
|
736
|
+
doktype = 'fak'
|
737
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
738
|
+
when /order/i
|
739
|
+
doktype = 'ord'
|
740
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
741
|
+
when /(kontrakt|avtale|vertrag|contract)/i
|
742
|
+
doktype = 'avt'
|
743
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
744
|
+
when /kvittering/i
|
745
|
+
doktype = 'kvi'
|
746
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
747
|
+
when /manual/i
|
748
|
+
doktype = 'man'
|
749
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
750
|
+
when /(billett|ticket)/i
|
751
|
+
doktype = 'bil'
|
752
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
753
|
+
when /(informasjon|information)/i
|
754
|
+
doktype = 'inf'
|
755
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
756
|
+
else
|
757
|
+
doktype = 'dok'
|
758
|
+
keywords_preface = setKeywordsPreface(metadata,doktype.gsub(/\-/,''))
|
759
|
+
end
|
760
|
+
if not metadata['keywords'].empty?
|
761
|
+
keywords_preface == '' ? keywords = '' : keywords = keywords_preface
|
762
|
+
keywordsarray = metadata['keywords'].split(',')
|
763
|
+
|
764
|
+
#
|
765
|
+
# Sort array
|
766
|
+
#
|
767
|
+
keywordssorted = Array.new
|
768
|
+
keywordsarray.each_with_index do |value,index|
|
769
|
+
value = value.lstrip.chomp
|
770
|
+
value = value.gsub(/(Faktura|Rechnungs)(nummer)? /i,'fak')
|
771
|
+
value = value.gsub(/(Kunde)(n)?(nummer)? /i,'kdn')
|
772
|
+
value = value.gsub(/(Kunde)(n)?(nummer)?-/i,'kdn')
|
773
|
+
value = value.gsub(/(Ordre|Bestellung)(s?nummer)? /i,'ord')
|
774
|
+
value = value.gsub(/(Kvittering|Quittung)(snummer)? /i,'kvi')
|
775
|
+
value = value.gsub(/\s/,'_')
|
776
|
+
value = value.gsub(/\//,'_')
|
777
|
+
keywordsarray[index] = value
|
778
|
+
if value.match(/^(fak|kdn|ord|kvi)/)
|
779
|
+
keywordssorted.insert(0, value)
|
780
|
+
else
|
781
|
+
keywordssorted.push(value)
|
782
|
+
end
|
783
|
+
end
|
784
|
+
|
785
|
+
counter = 0
|
786
|
+
keywordssorted.each_with_index do |value,index|
|
787
|
+
|
788
|
+
# Exit condition limits the number of keywords used in the filename
|
789
|
+
# unless all keywords shall be added
|
790
|
+
if not options[':all-keywords']
|
791
|
+
counter > options[:keywords]-1 ? break : counter = counter + 1
|
792
|
+
end
|
793
|
+
if value.match(/(kvi|fak|ord|kdn)/i)
|
794
|
+
keywords == '' ? keywords = '-' + value : keywords = value + '-' + keywords
|
795
|
+
else
|
796
|
+
keywords == '' ? keywords = '-' + value : keywords.concat('-' + value)
|
797
|
+
end
|
798
|
+
end
|
799
|
+
# Normalise the keywords as well
|
800
|
+
#
|
801
|
+
I18n.enforce_available_locales = false
|
802
|
+
keywords = I18n.transliterate(keywords)
|
803
|
+
|
804
|
+
# There are no keywords
|
805
|
+
# Rare, but it happens
|
806
|
+
else
|
807
|
+
|
808
|
+
# There are no keywords.
|
809
|
+
# we are using the title and the subject
|
810
|
+
if keywords_preface != ''
|
811
|
+
keywords = keywords_preface
|
812
|
+
end
|
813
|
+
|
814
|
+
end
|
815
|
+
extension = 'pdf'
|
816
|
+
if keywords != nil and keywords[0] != '-'
|
817
|
+
keywords = '-' + keywords
|
818
|
+
end
|
819
|
+
keywords == nil ? keywords = '' : ''
|
820
|
+
newFilename = date + '-' +
|
821
|
+
author + '-' +
|
822
|
+
doktype +
|
823
|
+
keywords + '.' +
|
824
|
+
extension
|
825
|
+
|
826
|
+
# Output directory checks
|
827
|
+
if options[:outputdir]
|
828
|
+
#if not File.exist?(options[:outputdir])
|
829
|
+
# puts "Error: output dir '#{options[:outputdir]}' not found. Abort"
|
830
|
+
# exit 1
|
831
|
+
#end
|
832
|
+
end
|
833
|
+
|
834
|
+
if not options[:dryrun] and filename != newFilename.downcase
|
835
|
+
`mv -v '#{filename}' '#{newFilename.downcase}'`
|
836
|
+
else
|
837
|
+
puts filename + "\n => " + newFilename.downcase
|
838
|
+
end
|
839
|
+
end
|
840
|
+
|
841
|
+
#
|
842
|
+
# One parameter to show the current version
|
843
|
+
#
|
844
|
+
map %w[--version -v] => :__print_version
|
845
|
+
desc "--version, -v", 'Show the current script version'
|
846
|
+
def __print_version
|
847
|
+
puts VERSION
|
848
|
+
end
|
849
|
+
|
850
|
+
end
|
851
|
+
|
852
|
+
DOC.start
|
853
|
+
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdfmd
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Daniel Roos
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-16 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Managing the common pdf metadata settings and renaming the pdf file
|
14
|
+
accordingly.
|
15
|
+
email: daniel@micronerd.org
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/pdfmd.rb
|
21
|
+
homepage: http://rubygems.org/gems/pdfmd
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.4.6
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: pdfmd - pdf-meta-data management
|
45
|
+
test_files: []
|