pdfmd 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/pdfmd.rb +853 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0e9a319bb39e3972119dabeda67a88918e1662a9
|
4
|
+
data.tar.gz: 2ddb7e4e715fe65192685c19bcdb560b36f3708b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e88256a30ab208960bf09071e88ec291c98349fc8b5dd66077867182b1a467bef916e2e6524e3371f95237863dd8e7e957462f52dcb9dc3a05b6bc172326d7ec
|
7
|
+
data.tar.gz: 0ae519f568c6409c249e5a154365c70a4c06dcfac3288ba863d336d28c2b4daf0188a4d07d0fa7c7fbf5b34bcfd16e5fe930b28759aa9c9f36db914abfe556ec
|
data/lib/pdfmd.rb
ADDED
@@ -0,0 +1,853 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# == Version 1.3
|
3
|
+
#
|
4
|
+
# == File: pdfmetadata.rb
|
5
|
+
#
|
6
|
+
# Show and edit Metadata of PDF files and rename the files accordingly.
|
7
|
+
#
|
8
|
+
# === Requirements
|
9
|
+
#
|
10
|
+
# ==== Ruby gems:
|
11
|
+
# - thor
|
12
|
+
# - highline/import
|
13
|
+
# - fileutils
|
14
|
+
# - i18n
|
15
|
+
# - pathname
|
16
|
+
# - logger
|
17
|
+
#
|
18
|
+
# ==== OS applications:
|
19
|
+
#
|
20
|
+
# - exiftool
|
21
|
+
#
|
22
|
+
# === Usage
|
23
|
+
#
|
24
|
+
# $ ./pdfmetadata <action> <parameter> file
|
25
|
+
#
|
26
|
+
# $ ./pdfmetadata help <action>
|
27
|
+
#
|
28
|
+
# An overview about the actions can be seen when running the script without
|
29
|
+
# any parameters
|
30
|
+
#
|
31
|
+
# === Changelog
|
32
|
+
#
|
33
|
+
# Version 1.3
|
34
|
+
# - Small bugfix about special characters in filenames (author).
|
35
|
+
# - Bugfix for the tag 'createdate' written as 'CreateDate' which did not
|
36
|
+
# take the date then.
|
37
|
+
# - Removed inactive code.
|
38
|
+
# - Added parameter 'version'
|
39
|
+
#
|
40
|
+
# Version 1.2
|
41
|
+
# - Small bugfix with the sort function and the logfile being created.
|
42
|
+
#
|
43
|
+
# Version 1.1
|
44
|
+
# - Added Function to sort pdf documents into a directory structure based on
|
45
|
+
# the author of the document.
|
46
|
+
# - Added dependency 'pathname'
|
47
|
+
# - Added dependency 'logger'
|
48
|
+
# - Added dependency 'i18n'
|
49
|
+
# - Added method 'sort'
|
50
|
+
# - Changing a tag will now output the old value in the edit dialog.
|
51
|
+
# - Updated documentation and descriptions of methods
|
52
|
+
#
|
53
|
+
# Version 1.0
|
54
|
+
# - Added documentation in long description of the commands
|
55
|
+
# - Added method "explain" for further information
|
56
|
+
#
|
57
|
+
# Version 0.9
|
58
|
+
# - Added 'rename' option to edit metatags
|
59
|
+
# - Fixed some output strings
|
60
|
+
#
|
61
|
+
# Version 0.x
|
62
|
+
# - All other stuff
|
63
|
+
#
|
64
|
+
# Check and set metadata of PDF documents
|
65
|
+
#
|
66
|
+
# A complete set of metadata contains
|
67
|
+
#
|
68
|
+
# * CreateDate
|
69
|
+
# * Title
|
70
|
+
# * Author
|
71
|
+
# * Subject
|
72
|
+
# * Keywords (optional)
|
73
|
+
#
|
74
|
+
# TODO: Include password protected PDF documents as well
|
75
|
+
# TODO: Fix broken PDF files automatically
|
76
|
+
# TODO: Enable logging in more functions than only "sort"
|
77
|
+
# TODO: Read this: http://lostechies.com/derickbailey/2011/04/29/writing-a-thor-application/
|
78
|
+
# TODO: ... and this: http://blog.paracode.com/2012/05/17/building-your-tools-with-thor/
|
79
|
+
# TODO: Create Gem: http://yehudakatz.com/2010/04/02/using-gemspecs-as-intended/
|
80
|
+
# gs \
|
81
|
+
# -o repaired.pdf \
|
82
|
+
# -sDEVICE=pdfwrite \
|
83
|
+
# -dPDFSETTINGS=/prepress \
|
84
|
+
# corrupted.pdf
|
85
|
+
#
|
86
|
+
# == Author
|
87
|
+
#
|
88
|
+
# Daniel Roos <daniel-git@micronarrativ.org>
|
89
|
+
# Source: https://github.com/Micronarrativ/micronarrativ/tree/scripts
|
90
|
+
#
|
91
|
+
require "thor"
|
92
|
+
require "highline/import"
|
93
|
+
require "fileutils"
|
94
|
+
require "i18n"
|
95
|
+
require 'pathname'
|
96
|
+
require 'logger'
|
97
|
+
|
98
|
+
# Version of this script; printed by the `--version` / `-v` command.
VERSION = '1.3'
|
99
|
+
#
|
100
|
+
# Read the relevant metadata tags of a file with the help of exiftool.
#
# hash readMetadata(string)
#
# pathFile - path of the file to inspect.
#
# Returns a Hash with the String keys 'keywords', 'subject', 'title',
# 'author', 'creator' and 'createdate'. A tag that is missing or empty in
# the file is returned as '' (never nil), so callers can safely use
# `value.empty?`.
#
# Prints a message and aborts the program if pathFile is not an existing
# regular file.
#
def readMetadata(pathFile = false)
  metadata = {
    'keywords'   => '',
    'subject'    => '',
    'title'      => '',
    'author'     => '',
    'creator'    => '',
    'createdate' => ''
  }

  unless pathFile && File.file?(pathFile)
    puts "Cannot access file #{pathFile}. Abort"
    abort
  end

  # Run exiftool directly (no shell) so that quotes or other special
  # characters in the file name cannot break or alter the command line.
  begin
    output = IO.popen(['exiftool', pathFile.to_s]) { |io| io.read }
  rescue SystemCallError => e
    warn "Could not run exiftool: #{e.message}"
    output = ''
  end

  # Which exiftool tag name (left of the ' : ' separator) feeds which key.
  tagPatterns = {
    'creator'    => /\ACreator\z/,
    'author'     => /\AAuthor/,
    'createdate' => /Create Date/,
    'subject'    => /Subject/,
    'keywords'   => /Keywords/,
    'title'      => /Title/
  }

  output.to_s.split("\n").each do |entry|
    # Limit the split to two parts: the value itself may contain ' : '.
    name, value = entry.split(' : ', 2)
    name  = name.to_s.strip
    value = value.to_s
    next if value.empty?

    # The first non-empty occurrence of a tag wins; later duplicates must
    # not overwrite (or reset) a value that has already been read.
    tagPatterns.each do |key, pattern|
      metadata[key] = value if metadata[key].empty? && name =~ pattern
    end
  end

  metadata
end
|
133
|
+
|
134
|
+
#
|
135
|
+
# Set Keywords Preface based on title and subject
#
# metadata - Hash as returned by readMetadata; only 'subject' is read.
# doktype  - String with the document type abbreviation (e.g. 'fak').
#
# If the subject starts with a digit and contains no whitespace (a document
# number such as '12345A'), the preface is the doktype directly followed by
# the subject.
# If not: the subject is transliterated to ASCII and every run of characters
# other than letters and digits (dots, spaces, ...) is replaced by a single
# underscore.
#
# Returns the preface as a String.
#
def setKeywordsPreface(metadata, doktype)
  subject = metadata['subject'].to_s

  # Document number: digits followed by further non-whitespace characters.
  return doktype + subject if subject =~ /\A\d+\S+\z/

  # Take care of special characters
  I18n.enforce_available_locales = false
  normalised = I18n.transliterate(subject)

  # Replace everything else
  normalised.gsub(/[^a-zA-Z0-9]+/, '_')
end
|
156
|
+
|
157
|
+
#
|
158
|
+
# Prompt the user on the terminal and hand back what was typed.
#
# textstring - prompt passed on to `ask` (provided by highline/import).
#
# Returns whatever `ask` returns for that prompt.
#
def readUserInput(textstring = 'Enter value: ')
  ask(textstring)
end
|
163
|
+
|
164
|
+
#
|
165
|
+
# Identify a date
# Function takes a string and tries to identify a date in there.
# returns false if no date could be identified
# otherwise the date is returned in the format as
#
# YYYY:MM:DD HH:mm:ss
#
# For missing time values zero is assumed. Single digit days and hours are
# zero-padded in the result.
#
# Recognised inputs (surrounding whitespace is ignored):
#   yyyymmdd, yyyymmd, yyyymmddHHMMSS,
#   yyyy<sep>mm<sep>dd and yyyy<sep>mm<sep>dd HH:MM:SS
# where <sep> is one of ':', '.' or '-'.
#
def identifyDate(datestring)
  year   = '[1-2][90][0-9][0-9]'
  month  = '0[1-9]|1[0-2]'
  day    = '[1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1]'
  hour   = '[0-1][0-9]|2[0-3]|[1-9]'
  minute = '[0-5][0-9]'
  second = '[0-5][0-9]'
  sep    = '[:.\-]'

  patterns = [
    /\A(#{year})(#{month})(#{day})\z/,
    /\A(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})\z/,
    /\A(#{year})#{sep}(#{month})#{sep}(#{day})\s(#{hour}):(#{minute}):(#{second})\z/,
    /\A(#{year})#{sep}(#{month})#{sep}(#{day})\z/
  ]

  input = datestring.to_s.strip
  patterns.each do |pattern|
    found = pattern.match(input)
    next unless found

    # String#to_i always parses base 10. Feeding the captured strings
    # straight to '%02d' would treat '08' and '09' as invalid octal numbers
    # and raise an ArgumentError.
    parts = found.captures.map { |part| part.to_i }
    parts.concat([0] * (6 - parts.size)) # missing time values default to zero
    return format('%04d:%02d:%02d %02d:%02d:%02d', *parts)
  end

  false
end
|
198
|
+
|
199
|
+
class DOC < Thor
|
200
|
+
|
201
|
+
|
202
|
+
#
|
203
|
+
# Show the current metadata tags
|
204
|
+
#
|
205
|
+
# TODO: format output as JSON and YAML
|
206
|
+
# TODO: Enable additional options
|
207
|
+
#
|
208
|
+
desc 'show', 'Show metadata of a file'
|
209
|
+
method_option :all, :type => :boolean, :aliases => '-a', :desc => 'Show all metatags', :default => false, :required => false
|
210
|
+
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Show specific tag(s), comma separated', :required => false
|
211
|
+
long_desc <<-LONGDESC
|
212
|
+
== General
|
213
|
+
|
214
|
+
Show metatags of a PDF document.
|
215
|
+
|
216
|
+
The following tags are being shown:
|
217
|
+
\x5 * Author
|
218
|
+
\x5 * Creator
|
219
|
+
\x5 * CreateDate
|
220
|
+
\x5 * Title
|
221
|
+
\x5 * Subject
|
222
|
+
\x5 * Keywords
|
223
|
+
|
224
|
+
== Parameters
|
225
|
+
|
226
|
+
--all, -a
|
227
|
+
\x5 Show all relevant metatags for a document.
|
228
|
+
|
229
|
+
Relevant tags are Author,Creator, CreateDate, Title, Subject, Keywords.
|
230
|
+
|
231
|
+
--tag, -t
|
232
|
+
\x5 Specify the metatag to show. The selected metatag must be one of the relevant tags. Other tags are ignored and nothing is returned.
|
233
|
+
|
234
|
+
== Example
|
235
|
+
|
236
|
+
# Show default metatags for a pdf document
|
237
|
+
\x5>CLI show <filename>
|
238
|
+
|
239
|
+
# Show default metatags for example.pdf
|
240
|
+
\x5>CLI show example.pdf
|
241
|
+
|
242
|
+
# Show value for metatag 'Author' for the file example.pdf
|
243
|
+
\x5>CLI show -t author example.pdf
|
244
|
+
|
245
|
+
# Show value for metatags 'Author','Title' for the file example.pdf
|
246
|
+
\x5>CLI show -t author,title example.pdf
|
247
|
+
|
248
|
+
LONGDESC
|
249
|
+
# Print the metadata of a file.
#
# filename - path of the PDF document.
#
# Without --tag (or with --all) every relevant tag is printed together with
# its label. With --tag only the values of the listed tags are printed, one
# per line; tag names are matched case-insensitively and an unknown tag
# prints an empty line.
def show(filename)
  metadata = readMetadata(filename)

  if options[:all] || options[:tag].nil?
    # Output all metatags
    [
      ['Author', 'author'],
      ['Creator', 'creator'],
      ['CreateDate', 'createdate'],
      ['Subject', 'subject'],
      ['Title', 'title'],
      ['Keywords', 'keywords']
    ].each do |label, key|
      puts "#{label} : " + metadata[key].to_s
    end
  else
    # Output only specific tags. The metadata keys are lowercase, so
    # normalise what the user typed ('Author', ' title', ...).
    options[:tag].split(',').each do |tag|
      puts metadata[tag.strip.downcase]
    end
  end
end
|
270
|
+
|
271
|
+
#
|
272
|
+
# Change a MetaTag Attribute
|
273
|
+
#
|
274
|
+
# TODO: keywords are added differently according to the documentation
|
275
|
+
# http://www.sno.phy.queensu.ca/~phil/exiftool/faq.html
|
276
|
+
desc 'edit', 'Edit Meta Tag(s)'
|
277
|
+
long_desc <<-LONGDESC
|
278
|
+
== General
|
279
|
+
|
280
|
+
Command will edit the metadata of a PDF document. Multiple values can be
|
281
|
+
specified or 'all'.
|
282
|
+
|
283
|
+
The command will invoke an interactive user input and request the values
|
284
|
+
for the metatag.
|
285
|
+
|
286
|
+
Additionally the file can be renamed at the end according to the new meta
|
287
|
+
tags. See `$ #{__FILE__} help rename` for details.
|
288
|
+
|
289
|
+
== Parameters
|
290
|
+
|
291
|
+
--tag, -t
|
292
|
+
\x5 Names or list of names of Metatag fields to set, separated by commata.
|
293
|
+
|
294
|
+
--rename, -r
|
295
|
+
\x5 Rename file after updating the meta tag information according to the fields.
|
296
|
+
|
297
|
+
This parameter is identical to running `> CLI rename <filename>`
|
298
|
+
|
299
|
+
General example:
|
300
|
+
|
301
|
+
# Edit tag 'TAG' and set a new value interactive.
|
302
|
+
\x5>CLI edit -t TAG <filename>
|
303
|
+
|
304
|
+
# Edit tag 'Author' and set new value interactive.
|
305
|
+
\x5>CLI edit -t author example.pdf
|
306
|
+
|
307
|
+
# Edit mulitple Tags and set a new value.
|
308
|
+
\x5>CLI edit -t tag1,tag2,tag3 <filename>
|
309
|
+
|
310
|
+
|
311
|
+
== Multiple Tags
|
312
|
+
|
313
|
+
For setting multiple tags list the tags comma separated.
|
314
|
+
|
315
|
+
For setting all tags (Author, Title, Subject, CreateDate, Keywords) use the keyword 'all' as tagname.
|
316
|
+
|
317
|
+
# Set tags 'Author', 'Title', 'Subject' in example.pdf interactivly.
|
318
|
+
\x5>CLI edit -t author,title,subject example.pdf`
|
319
|
+
|
320
|
+
# Set tags 'Author', 'Title', 'Subject', 'CreateDate', 'Keywords' in
|
321
|
+
example.pdf interactive.
|
322
|
+
\x5>CLI edit -t all example.pdf
|
323
|
+
|
324
|
+
== Tag: CreateDate
|
325
|
+
|
326
|
+
In order to enter a value for the 'CreateDate' field, some internal matching is going on in order to make it easier and faster to enter dates and times.
|
327
|
+
|
328
|
+
The following formats are identified/matched:
|
329
|
+
|
330
|
+
\x5 yyyymmdd
|
331
|
+
\x5 yyyymmd
|
332
|
+
\x5 yyyymmddHHMMSS
|
333
|
+
\x5 yyyy-mm-dd HH:MM:SS
|
334
|
+
\x5 yyyy:mm:dd HH:MM:SS
|
335
|
+
\x5 yyyy.mm.dd HH:MM:SS
|
336
|
+
\x5 yyyy-mm-d
|
337
|
+
\x5 yyyy-mm-dd
|
338
|
+
\x5 yyyy.mm.d
|
339
|
+
\x5 yyyy.mm.dd
|
340
|
+
\x5 yyyy:mm:d
|
341
|
+
\x5 yyyy:mm:dd
|
342
|
+
|
343
|
+
\x5 - If HH:MM:SS or HHMMSS is not provided, those values are automatically set to zero.
|
344
|
+
\x5 - The output format of every timestamp is <yyyy:mm:dd HH:MM:SS>
|
345
|
+
\x5 - When providing and invalid date, the incorrect date is rejected and the user asked to provide the correct date.
|
346
|
+
|
347
|
+
== Rename file
|
348
|
+
|
349
|
+
In addition to setting the tags the current file can be renamed according to
|
350
|
+
the new metadata.
|
351
|
+
|
352
|
+
# Set tag 'Author' and rename file example.pdf
|
353
|
+
\x5> CLI edit -t author -r example.pdf
|
354
|
+
|
355
|
+
See `> CLI help rename` for details about renaming.
|
356
|
+
|
357
|
+
LONGDESC
|
358
|
+
# No default for :tag: a default value would silently satisfy the
# `:required => true` check and hand `false` to the command.
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Name of the Tag(s) to Edit', :required => true
method_option :rename, :type => :boolean, :aliases => '-r', :desc => 'Rename file after changing meta-tags', :default => false, :required => false
# Interactively change one or more metadata tags of a file.
#
# filename - path of the PDF document.
#
# For every tag named in --tag (or the five standard tags for 'all') the
# current value is shown and a new value is requested from the user. Values
# for 'createdate' are requested again until identifyDate accepts them. The
# new value is written with exiftool. With --rename the `rename` command of
# this script is run for the file afterwards.
def edit(filename)
  metadata = readMetadata(filename)

  tags = if options[:tag] == 'all'
           ['author', 'title', 'subject', 'createdate', 'keywords']
         else
           options[:tag].split(',')
         end

  tags.each do |currentTag|
    # Change the tag to something we can use here
    tagName = currentTag.strip.downcase

    puts "Current value: '#{metadata[tagName]}'"
    answer = readUserInput("Enter new value for #{currentTag} :")
    if tagName == 'createdate'
      until (parsedDate = identifyDate(answer))
        puts 'Invalid date format'
        answer = readUserInput("Enter new value for #{currentTag} :")
      end
      answer = parsedDate
    end
    puts "Changing value for #{currentTag}: '#{metadata[tagName]}' => #{answer}"

    # Call exiftool without a shell: quotes in the new value or in the file
    # name must not be able to break or extend the command line.
    begin
      IO.popen(['exiftool', "-#{tagName}=#{answer}", '-overwrite_original', filename.to_s]) { |io| io.read }
    rescue SystemCallError => e
      abort "Could not run exiftool: #{e.message}"
    end
  end

  #
  # If required, run the renaming task afterwards
  # This is not pretty, but seems to be the only way to do this in THOR
  #
  if options[:rename]
    puts IO.popen([__FILE__, 'rename', filename.to_s]) { |io| io.read }
  end
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Check the metadata for the minimum necessary tags
|
395
|
+
# See documentation at the top of this file for details
|
396
|
+
#
|
397
|
+
# void check(string)
|
398
|
+
desc 'check', 'Check Metadata for completeness'
|
399
|
+
long_desc <<-LONGDESC
|
400
|
+
== General
|
401
|
+
|
402
|
+
Show value of the following metatags of a PDF document:
|
403
|
+
|
404
|
+
- Author
|
405
|
+
\x5- Creator
|
406
|
+
\x5- CreateDate
|
407
|
+
\x5- Subject
|
408
|
+
\x5- Title
|
409
|
+
\x5- Keywords
|
410
|
+
|
411
|
+
== Example
|
412
|
+
|
413
|
+
# Show the values of the metatags for example.pdf
|
414
|
+
\x5>CLI show example.pdf
|
415
|
+
|
416
|
+
LONGDESC
|
417
|
+
# Report which of the mandatory tags (author, subject, createdate, title)
# have no value in the given file.
#
# filename - path of the PDF document.
#
# Prints one 'Missing value: <tag>' line per empty tag and terminates the
# process with exit status 1 if at least one tag is missing, 0 otherwise.
def check(filename)
  missing = readMetadata(filename).select do |key, value|
    # to_s: treat a nil value like an empty one instead of crashing
    key.match(/author|subject|createdate|title/) && value.to_s.empty?
  end

  missing.each_key { |key| puts 'Missing value: ' + key }
  exit(missing.empty? ? 0 : 1)
end
|
427
|
+
|
428
|
+
#
|
429
|
+
# Explain fields and Metatags
|
430
|
+
# Show information about how they are used.
|
431
|
+
#
|
432
|
+
desc 'explain','Show more information about usuable Meta-Tags'
|
433
|
+
long_desc <<-LONGDESC
|
434
|
+
== General
|
435
|
+
|
436
|
+
Explain some terms used with the script.
|
437
|
+
|
438
|
+
== Example
|
439
|
+
|
440
|
+
# Show the available subjects
|
441
|
+
\x5>CLI explain
|
442
|
+
|
443
|
+
# Show information about the subject 'author'
|
444
|
+
\x5>CLI explain author
|
445
|
+
|
446
|
+
LONGDESC
|
447
|
+
# Print an explanation of a metadata subject.
#
# term - name of the subject ('author', 'createdate', 'title', 'subject',
#        'keywords'); matched case-insensitively. An empty term prints the
#        list of available subjects.
#
# An unknown term is reported and followed by the list of available
# subjects instead of printing nothing.
def explain(term='')
  overview = [
    'Available subjects:',
    '- author',
    '- createdate',
    '- keywords',
    '- subject',
    '- title',
    ' ',
    "Run `$ #{__FILE__} explain <subject>` to get more details."
  ]

  explanations = {
    'author' => [
      '[Author]',
      ' The sender or creator of the document.'
    ],
    'createdate' => [
      '[CreateDate]',
      ' Date of the document. This is not the date when the file was created, but',
      ' the date found in the document or printed on the document.'
    ],
    'title' => [
      '[Title]',
      ' General type of the document, e.g. Manual, Invoice.'
    ],
    'subject' => [
      '[Subject]',
      ' What is the document about.',
      ' For example:',
      ' Manual: What is the manual about?',
      ' Invoice: Invoice number?',
      ' Contract: Contract number of Subject of the contract?',
      ' Order: Ordernumber of the document?'
    ],
    'keywords' => [
      '[Keywords]',
      ' Anything else that might be of interesst.',
      ' In Orders the elements that have been orders. Contracts might contain the',
      ' Names and adress of the involved parties.',
      ' ',
      ' When writing Invoices with their numbers, these will be automatically be ',
      ' picked up and can be integrated in the filename, e.g. "Invoicenumber 12334'
    ]
  }

  # Accept 'Author', ' author ' etc. just like the lowercase name.
  subject = term.to_s.strip.downcase

  if subject.empty?
    puts overview
  elsif explanations.key?(subject)
    puts explanations[subject]
  else
    puts "Unknown subject '#{term}'."
    puts overview
  end
end
|
488
|
+
|
489
|
+
#
|
490
|
+
# Sort the files into directories based on the author
|
491
|
+
#
|
492
|
+
desc 'sort','Sort files into directories sorted by Author'
|
493
|
+
long_desc <<-LONGDESC
|
494
|
+
== General
|
495
|
+
|
496
|
+
Will sort pdf documents into subdirectories according to the value of their
|
497
|
+
tag 'author'.
|
498
|
+
|
499
|
+
When using this action a logfile with all actions will be generated in the
|
500
|
+
current working directory with the same name as the script and the ending
|
501
|
+
'.log'. This can be disabled with the parameter 'log' if required.
|
502
|
+
|
503
|
+
If a document does not have an entry in the meta tag 'author', the file will
|
504
|
+
not be processed. This can be seen in the output of the logfile as well.
|
505
|
+
|
506
|
+
=== Parameters
|
507
|
+
|
508
|
+
[*destination|d*]
|
509
|
+
\x5 Speficy the root output directory to where the folderstructure is being created.
|
510
|
+
|
511
|
+
This parameter is required.
|
512
|
+
|
513
|
+
[*copy|c*]
|
514
|
+
\x5 Copy the files instead of moving them.
|
515
|
+
|
516
|
+
[*log|l*]
|
517
|
+
\x5 Disable/Enable the logging.
|
518
|
+
\x5 Default: enabled.
|
519
|
+
|
520
|
+
=== Replacement rules
|
521
|
+
|
522
|
+
The subdirectories for the documents are generated from the values in the
|
523
|
+
tag 'author' of each document.
|
524
|
+
|
525
|
+
In order to ensure a clean directory structure, there are certain rules
|
526
|
+
for altering the values.
|
527
|
+
\x5 1. Whitespaces are replaced by underscores.
|
528
|
+
\x5 2. Dots are replaced by underscores.
|
529
|
+
\x5 3. All letters are converted to their lowercase version.
|
530
|
+
\x5 4. Special characters are serialized
|
531
|
+
|
532
|
+
=== Example
|
533
|
+
|
534
|
+
This command does the following:
|
535
|
+
\x5 1. Take all pdf documents in the subdirectory ./documents.
|
536
|
+
\x5 2. Create the output folder structure in `/tmp/test/`.
|
537
|
+
\x5 3. Copy the files instead of moving them.
|
538
|
+
\x5 4. Disable the logging.
|
539
|
+
\x5> CLI sort -d /tmp/test -c -l false ./documents
|
540
|
+
|
541
|
+
LONGDESC
|
542
|
+
method_option :destination, :aliases => '-d', :required => true, :type => :string, :desc => 'Defines the output directory'
method_option :copy, :aliases => '-c', :required => false, :type => :boolean, :desc => 'Copy files instead of moving them'
method_option :log, :aliases => '-l', :required => false, :type => :boolean, :desc => 'Enable/Disable creation of log files', :default => true
# Sort the pdf files of a directory into subdirectories named after the
# value of their 'author' tag.
#
# inputDir - directory whose '*.pdf' files are processed (default: '.').
#
# The target root is taken from --destination and created if necessary.
# Files are moved, or copied with --copy. Files without an author and files
# whose target already exists are skipped. Unless logging is disabled with
# --log, all actions are written to '<scriptname>.log' in the current
# working directory.
#
# Aborts if inputDir does not exist or is not a directory, or if the
# destination is an existing file.
def sort(inputDir = '.')
  destination = options[:destination]
  logenable   = options[:log]
  scriptname  = Pathname.new(__FILE__).basename

  # Logger.new(nil) discards every message, so the logger can be called
  # unconditionally below even when logging has been disabled.
  logger = Logger.new(logenable ? Dir.pwd.chomp('/') + "/#{scriptname}.log" : nil)

  # Input validation
  abort('Input directory does not exist. Abort.') unless File.exist?(inputDir)
  abort('Input is a single file') unless File.directory?(inputDir)
  if File.file?(destination)
    abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort")
  end
  unless File.directory?(destination)
    FileUtils.mkdir_p(destination)
    logger.info("Destination '#{destination}' has been created.")
  end

  # Iterate through all files
  Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
    authorTag = readMetadata(file)['author'].to_s

    if authorTag.empty?
      if logenable
        logger.warn("Missing tag 'Author' for file '#{file}'. Skipping.")
      else
        puts "Missing tag 'Author' for file '#{file}'. Skipping"
      end
      next
    end

    author = authorTag.gsub(' ', '_').gsub('.', '_')
    I18n.enforce_available_locales = false # Serialize special characters
    author = I18n.transliterate(author).downcase

    folderdestination = destination.chomp('/') + '/' + author
    unless File.directory?(folderdestination)
      FileUtils.mkdir_p(folderdestination)
      logger.info("Folder '#{folderdestination}' has been created.")
    end
    filedestination = folderdestination + '/' + Pathname.new(file).basename.to_s

    # Final check before touching the filesystem
    if File.exist?(filedestination)
      logger.warn("File '#{filedestination}' already exists. Ignoring.")
      next
    end

    logger.info("File '#{file}' => '#{filedestination}'")

    # Move/Copy the file
    if options[:copy]
      FileUtils.cp(file, filedestination)
    else
      FileUtils.mv(file, filedestination)
    end
  end
end
|
596
|
+
|
597
|
+
#
|
598
|
+
# Rename the file according to the Metadata
|
599
|
+
#
|
600
|
+
# Scheme: YYYYMMDD-author-subject-keywords.extension
|
601
|
+
desc 'rename', 'Rename the file according to Metadata'
|
602
|
+
long_desc <<-LONGDESC
|
603
|
+
== General
|
604
|
+
|
605
|
+
Rename a file with the meta tags in the document.
|
606
|
+
|
607
|
+
== Parameter
|
608
|
+
|
609
|
+
--dry-run, -n
|
610
|
+
\x5 Simulate the renaming process and show the result without changing the file.
|
611
|
+
|
612
|
+
--all-keywords, -a
|
613
|
+
\x5 Use all keywords from the meta information in the file name and ignore the limit.
|
614
|
+
|
615
|
+
--keywwords, -k
|
616
|
+
\x5 Set the number of keywords used in the filename to a new value.
|
617
|
+
\x5 Default: 3
|
618
|
+
|
619
|
+
--outputdir, -o
|
620
|
+
\x5 Not implemented yet. Default output dir for the renamed file is the source directory.
|
621
|
+
|
622
|
+
== Example
|
623
|
+
|
624
|
+
# Rename the file according to the metatags
|
625
|
+
\x5> CLI rename <filename>
|
626
|
+
|
627
|
+
# Rename example.pdf according to the metatags
|
628
|
+
\x5> CLI rename example.pdf
|
629
|
+
|
630
|
+
# Simulate renaming example.pdf according to the metatags (dry-run)
|
631
|
+
\x5> CLI rename -n example.pdf
|
632
|
+
|
633
|
+
== Rules
|
634
|
+
|
635
|
+
There are some rules regarding how documents are being renamed
|
636
|
+
|
637
|
+
Rule 1: All documents have the following filenaming structure:
|
638
|
+
|
639
|
+
<yyyymmdd>-<author>-<type>-<additionalInformation>.<extension>
|
640
|
+
|
641
|
+
\x5 # <yyyymmdd>: Year, month and day identival to the meta information in the
|
642
|
+
document.
|
643
|
+
\x5 # <author>: Author of the document, identical to the meta information
|
644
|
+
in the document. Special characters and whitespaces are replaced.
|
645
|
+
\x5 # <type>: Document type, is being generated from the title field in the metadata of the document. Document type is a three character abbreviation following the following logic:
|
646
|
+
|
647
|
+
\x5 til => Tilbudt|Angebot
|
648
|
+
\x5 odb => Orderbekreftelse
|
649
|
+
\x5 fak => Faktura
|
650
|
+
\x5 ord => Order
|
651
|
+
\x5 avt => Kontrakt|Avtale|Vertrag|contract
|
652
|
+
\x5 kvi => Kvittering
|
653
|
+
\x5 man => Manual
|
654
|
+
\x5 bil => Billett|Ticket
|
655
|
+
\x5 inf => Informasjon|Information
|
656
|
+
\x5 dok => unknown
|
657
|
+
|
658
|
+
If the dokument type can not be determined automatically, it defaults to 'dok'.
|
659
|
+
|
660
|
+
# <additionalInformation>: Information generated from the metadata fields
|
661
|
+
'title', 'subject' and 'keywords'.
|
662
|
+
|
663
|
+
If 'Title' or 'Keywords' contains one of the following keywords, the will be replaced with the corresponding abbreviation followed by the specified value separated by a whitespace:
|
664
|
+
|
665
|
+
\x5 fak => Faktura|Fakturanummer|Rechnung|Rechnungsnummer
|
666
|
+
\x5 kdn => Kunde|Kundenummer|Kunde|Kundennummer
|
667
|
+
\x5 ord => Ordre|Ordrenummer|Bestellung|Bestellungsnummer
|
668
|
+
\x5 kvi => Kvittering|Kvitteringsnummer|Quittung|Quittungsnummer
|
669
|
+
|
670
|
+
Rule 2: The number of keywords used in the filename is defined by the parameter '-k'. See the section of that parameter for more details and the default value.
|
671
|
+
|
672
|
+
Rule 3: Keywords matching 'kvi','fak','ord','kdn' are prioritised.
|
673
|
+
|
674
|
+
Rule 4: Special characters and whitespaces are replaced:
|
675
|
+
|
676
|
+
\x5 ' ' => '_'
|
677
|
+
\x5 '/' => '_'
|
678
|
+
|
679
|
+
Rule 5: The new filename has only lowercase characters.
|
680
|
+
|
681
|
+
== Example (detailed)
|
682
|
+
|
683
|
+
# Example PDF with following MetaTags:
|
684
|
+
|
685
|
+
\x5 Filename : example.pdf
|
686
|
+
\x5 Author : John
|
687
|
+
\x5 Subject : new Product
|
688
|
+
\x5 Title : Presentation
|
689
|
+
\x5 CreateDate : 1970:01:01 01:00:00
|
690
|
+
\x5 Keywords : John Doe, Jane Doe, Mister Doe
|
691
|
+
|
692
|
+
# Renaming the file
|
693
|
+
\x5> CLI rename example.pdf
|
694
|
+
\x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
|
695
|
+
|
696
|
+
# Simulation to rename the file (no actual change)
|
697
|
+
\x5> CLI rename -n example.pdf
|
698
|
+
\x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
|
699
|
+
|
700
|
+
# Renaming the file with all keywords
|
701
|
+
\x5> CLI rename -n -a example.pdf
|
702
|
+
\x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe-mister_doe.pdf
|
703
|
+
|
704
|
+
LONGDESC
|
705
|
+
method_option :dryrun, :type => :boolean, :aliases => '-n', :desc => 'Run without making changes', :default => false, :required => false
# The option name must not carry a leading colon inside the string,
# otherwise the command line switch becomes '--:all-keywords'.
method_option :'all-keywords', :type => :boolean, :aliases => '-a', :desc => 'Add all keywords (no limit)', :default => false, :required => false
method_option :keywords, :type => :numeric, :aliases => '-k', :desc => 'Number of keywords to include (Default: 3)', :default => 3, :required => false
# No default: the Symbol :false is truthy and would mark the option as set.
method_option :outputdir, :aliases => '-o', :type => :string, :desc => 'Speficy output directory', :required => false
# Rename a file according to its metadata.
#
# filename - path of the PDF document.
#
# The new name is '<yyyymmdd>-<author>-<type>-<keywords>.pdf' in lowercase,
# where <type> is derived from the title and <keywords> from the subject and
# the keywords tag. The file stays in its current directory. With --dryrun,
# or when the file already has the computed name, the old and new name are
# only printed.
#
# Exits with status 1 if author, subject, createdate or title is empty, and
# aborts if the file cannot be renamed.
def rename(filename)
  metadata = readMetadata(filename)

  # Check if the metadata is complete
  metadata.each do |key, value|
    next unless key.match(/author|subject|createdate|title/) && value.to_s.empty?

    puts 'Missing value for ' + key
    puts 'Abort'
    exit 1
  end

  # 'YYYY:MM:DD HH:MM:SS...' => 'YYYYMMDD'
  date   = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/, '').gsub(/\:/, '')
  author = metadata['author'].gsub(/\./, '_').gsub(/\-/, '').gsub(/\s/, '_')
  I18n.enforce_available_locales = false
  author = I18n.transliterate(author) # Normalising

  # Document type from the title; the first matching entry wins, so the
  # order confirmation must be listed before the plain order.
  doktypes = [
    [/(Tilbudt|Angebot)/i, 'til'],
    [/Orderbekreft?else/i, 'odb'],
    [/faktura/i, 'fak'],
    [/order/i, 'ord'],
    [/(kontrakt|avtale|vertrag|contract)/i, 'avt'],
    [/kvittering/i, 'kvi'],
    [/manual/i, 'man'],
    [/(billett|ticket)/i, 'bil'],
    [/(informasjon|information)/i, 'inf']
  ]
  title   = metadata['title'].to_s
  matched = doktypes.find { |pattern, _abbreviation| title =~ pattern }
  doktype = matched ? matched[1] : 'dok'
  keywords_preface = setKeywordsPreface(metadata, doktype)

  keywords = nil
  if !metadata['keywords'].to_s.empty?
    keywords = keywords_preface.dup

    #
    # Sort array: abbreviated keywords (fak, kdn, ord, kvi) go to the front
    #
    keywordssorted = []
    metadata['keywords'].split(',').each do |value|
      value = value.lstrip.chomp
      value = value.gsub(/(Faktura|Rechnungs)(nummer)? /i, 'fak')
      value = value.gsub(/(Kunde)(n)?(nummer)? /i, 'kdn')
      value = value.gsub(/(Kunde)(n)?(nummer)?-/i, 'kdn')
      value = value.gsub(/(Ordre|Bestellung)(s?nummer)? /i, 'ord')
      value = value.gsub(/(Kvittering|Quittung)(snummer)? /i, 'kvi')
      value = value.gsub(/\s/, '_')
      value = value.gsub(/\//, '_')
      if value.match(/^(fak|kdn|ord|kvi)/)
        keywordssorted.unshift(value)
      else
        keywordssorted.push(value)
      end
    end

    # Limit the number of keywords used in the filename unless all keywords
    # shall be added
    unless options['all-keywords']
      keywordssorted = keywordssorted.first([options[:keywords].to_i, 0].max)
    end

    keywordssorted.each do |value|
      if keywords == ''
        keywords = '-' + value
      elsif value.match(/(kvi|fak|ord|kdn)/i)
        keywords = value + '-' + keywords
      else
        keywords.concat('-' + value)
      end
    end

    # Normalise the keywords as well
    I18n.enforce_available_locales = false
    keywords = I18n.transliterate(keywords)
  elsif keywords_preface != ''
    # There are no keywords (rare, but it happens):
    # we are using the title and the subject
    keywords = keywords_preface
  end

  if keywords.nil?
    keywords = ''
  elsif !keywords.start_with?('-')
    keywords = '-' + keywords
  end

  extension = 'pdf'
  # A '/' taken over from the metadata would be read as a directory
  # separator, so it is replaced like in the keywords.
  newFilename = (date + '-' + author + '-' + doktype + keywords + '.' + extension).downcase.gsub('/', '_')

  # NOTE: options[:outputdir] is not implemented yet; the renamed file stays
  # in the directory of the source file.
  if !options[:dryrun] && File.basename(filename) != newFilename
    target = File.join(File.dirname(filename), newFilename)
    begin
      FileUtils.mv(filename, target)
    rescue SystemCallError => e
      abort "Could not rename '#{filename}': #{e.message}"
    end
  else
    puts filename + "\n => " + newFilename
  end
end
|
840
|
+
|
841
|
+
#
|
842
|
+
# One parameter to show the current version
|
843
|
+
#
|
844
|
+
map %w[--version -v] => :__print_version
|
845
|
+
desc "--version, -v", 'Show the current script version'
|
846
|
+
# Write the script version (the VERSION constant) to standard output.
def __print_version
  $stdout.puts(VERSION)
end
|
849
|
+
|
850
|
+
end
|
851
|
+
|
852
|
+
# Entry point: let Thor dispatch to the DOC command given on the command line.
DOC.start
|
853
|
+
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdfmd
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Daniel Roos
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-16 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Managing the commong pdf metadata settings and renaming the pdf file
|
14
|
+
accordingly.
|
15
|
+
email: daniel@micronerd.org
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/pdfmd.rb
|
21
|
+
homepage: http://rubygems.org/gems/pdfmd
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.4.6
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: pdfmd - pdf-meta-data management
|
45
|
+
test_files: []
|