pdfmd 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/pdfmd.rb +853 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0e9a319bb39e3972119dabeda67a88918e1662a9
4
+ data.tar.gz: 2ddb7e4e715fe65192685c19bcdb560b36f3708b
5
+ SHA512:
6
+ metadata.gz: e88256a30ab208960bf09071e88ec291c98349fc8b5dd66077867182b1a467bef916e2e6524e3371f95237863dd8e7e957462f52dcb9dc3a05b6bc172326d7ec
7
+ data.tar.gz: 0ae519f568c6409c249e5a154365c70a4c06dcfac3288ba863d336d28c2b4daf0188a4d07d0fa7c7fbf5b34bcfd16e5fe930b28759aa9c9f36db914abfe556ec
@@ -0,0 +1,853 @@
1
+ #!/usr/bin/env ruby
2
+ # == Version 1.3
3
+ #
4
+ # == File: pdfmetadata.rb
5
+ #
6
+ # Show and edit Metadata of PDF files and rename the files accordingly.
7
+ #
8
+ # === Requirements
9
+ #
10
+ # ==== Ruby gems:
11
+ # - thor
12
+ # - highline/import
13
+ # - fileutils
14
+ # - i18n
15
+ # - pathname
16
+ # - logger
17
+ #
18
+ # ==== OS applications:
19
+ #
20
+ # - exiftool
21
+ #
22
+ # === Usage
23
+ #
24
+ # $ ./pdfmetadata <action> <parameter> file
25
+ #
26
+ # $ ./pdfmetadata help <action>
27
+ #
28
+ # An overview about the actions can be seen when running the script without
29
+ # any parameters
30
+ #
31
+ # === Changelog
32
+ #
33
+ # Version 1.3
34
+ # - Small bugfix about special characters in filenames (author).
35
+ # - Bugfix for the tag 'createdate' written as 'CreateDate' which did not
36
+ # take the date then.
37
+ # - Removed inactive code.
38
+ # - Added parameter 'version'
39
+ #
40
+ # Version 1.2
41
+ # - Small bugfix with the sort function and the logfile being created.
42
+ #
43
+ # Version 1.1
44
+ # - Added Function to sort pdf documents into a directory structure based on
45
+ # the author of the document.
46
+ # - Added dependency 'pathname'
47
+ # - Added dependency 'logger'
48
+ # - Added dependency 'i18n'
49
+ # - Added method 'sort'
50
+ # - Changing a tag will now output the old value in the edit dialog.
51
+ # - Updated documentation and descriptions of methods
52
+ #
53
+ # Version 1.0
54
+ # - Added documentation in long description of the commands
55
+ # - Added method "explain" for further information
56
+ #
57
+ # Version 0.9
58
+ # - Added 'rename' option to edit metatags
59
+ # - Fixed some output strings
60
+ #
61
+ # Version 0.x
62
+ # - All other stuff
63
+ #
64
+ # Check and set metadata of PDF documents
65
+ #
66
+ # A complete set of metadata contains
67
+ #
68
+ # * CreateDate
69
+ # * Title
70
+ # * Author
71
+ # * Subject
72
+ # * Keywords (optional)
73
+ #
74
+ # TODO: Include password protected PDF documents as well
75
+ # TODO: Fix broken PDF files automatically
76
+ # TODO: Enable logging in more functions than only "sort"
77
+ # TODO: Read this: http://lostechies.com/derickbailey/2011/04/29/writing-a-thor-application/
78
+ # TODO: ... and this: http://blog.paracode.com/2012/05/17/building-your-tools-with-thor/
79
+ # TODO: Create Gem: http://yehudakatz.com/2010/04/02/using-gemspecs-as-intended/
80
+ # gs \
81
+ # -o repaired.pdf \
82
+ # -sDEVICE=pdfwrite \
83
+ # -dPDFSETTINGS=/prepress \
84
+ # corrupted.pdf
85
+ #
86
+ # == Author
87
+ #
88
+ # Daniel Roos <daniel-git@micronarrativ.org>
89
+ # Source: https://github.com/Micronarrativ/micronarrativ/tree/scripts
90
+ #
91
+ require "thor"
92
+ require "highline/import"
93
+ require "fileutils"
94
+ require "i18n"
95
+ require 'pathname'
96
+ require 'logger'
97
+
98
# Script version, printed by the `--version` / `-v` switch.
# Frozen so the constant cannot be modified in place.
VERSION = '1.3'.freeze
99
#
# Function to read the metadata from a given file
# hash readMetadata(string)
#
# Runs `exiftool` on the file and returns a hash with the keys 'keywords',
# 'subject', 'title', 'author', 'creator' and 'createdate' (in that order).
# Every value is a String; tags that are missing or empty in the file are
# returned as ''.
#
# If a tag is reported more than once, the first non-empty value wins.
#
# Aborts the program when +pathFile+ is not an existing regular file.
#
def readMetadata(pathFile = false)
  # exiftool tag name => key in the returned hash
  tagNames = {
    'Creator'     => 'creator',
    'Author'      => 'author',
    'Create Date' => 'createdate',
    'Subject'     => 'subject',
    'Keywords'    => 'keywords',
    'Title'       => 'title'
  }

  metadata = {
    'keywords'   => '',
    'subject'    => '',
    'title'      => '',
    'author'     => '',
    'creator'    => '',
    'createdate' => ''
  }

  # The default argument (false) must not reach File.file?, which only
  # accepts path-like objects.
  unless pathFile && File.file?(pathFile)
    puts "Cannot access file #{pathFile}. Abort"
    abort
  end

  # Pass the file name as a separate argument instead of building a shell
  # string, so names containing quotes or other shell characters are safe.
  begin
    output = IO.popen(['exiftool', pathFile.to_s]) { |io| io.read }
  rescue Errno::ENOENT
    warn 'exiftool: command not found'
    output = ''
  end

  # exiftool prints one "Tag Name      : value" pair per line.
  # Split at the first colon only, so values containing " : " stay intact.
  output.to_s.each_line do |line|
    tag, value = line.chomp.split(/\s*:\s?/, 2)
    key = tagNames[tag.to_s.strip]
    next if key.nil? || !metadata[key].empty?
    metadata[key] = value.to_s
  end

  metadata
end
133
+
134
#
# Set Keywords Preface based on title and subject
#
# If the subject starts with a digit and contains no whitespace (for example
# an invoice or order number), the preface is the doktype followed by the
# unchanged subject.
# If not: the subject is transliterated to ASCII and every run of characters
# other than letters and digits is replaced with a single underscore.
#
# string setKeywordsPreface(hash, string)
#
def setKeywordsPreface(metadata, doktype)
  subject = metadata['subject'].to_s

  # The previous pattern /^\d+[^+s]+.*/ excluded the characters '+' and 's'
  # rather than whitespace, so subjects with spaces were treated as numbers.
  return doktype + subject if subject.match(/\A\d\S*\z/)

  # Take care of special characters, then replace everything else
  I18n.enforce_available_locales = false
  I18n.transliterate(subject).gsub(/[^a-zA-Z0-9]+/, '_')
end
156
+
157
#
# Prompt the user with +textstring+ and hand back whatever `ask` returns.
# string readUserInput(string)
#
def readUserInput(textstring = 'Enter value: ')
  ask(textstring)
end
163
+
164
#
# Identify a date
# Function takes a string and tries to identify a date in there.
# returns false if no date could be identified
# otherwise the date is returned in the format as
#
# YYYY:MM:DD HH:mm:ss
#
# For missing time values zero is assumed. Single-digit day and hour values
# are zero-padded in the result.
#
# Accepted input formats (separator is one of ':', '.', '-'):
#   yyyymmdd, yyyymmd, yyyymmddHHMMSS,
#   yyyy-mm-dd, yyyy-mm-d, yyyy-mm-dd HH:MM:SS
#
def identifyDate(datestring)
  # NOTE(review): the year pattern also accepts 10xx and 29xx - confirm
  # whether that is intended before tightening it.
  year   = '[1-2][90][0-9][0-9]'
  month  = '0[1-9]|1[0-2]'
  day    = '[1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1]'
  hour   = '[0-1][0-9]|2[0-3]|[1-9]'
  minute = '[0-5][0-9]'
  second = '[0-5][0-9]'
  sep    = '[:.\-]'

  # The four formats cannot match the same input, so the order only mirrors
  # the documentation. Regexp.last_match holds the match of the pattern
  # that succeeded.
  found =
    case datestring.to_s.strip
    when /\A(#{year})(#{month})(#{day})\z/,
         /\A(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})\z/,
         /\A(#{year})#{sep}(#{month})#{sep}(#{day})\s(#{hour}):(#{minute}):(#{second})\z/,
         /\A(#{year})#{sep}(#{month})#{sep}(#{day})\z/
      Regexp.last_match
    end
  return false unless found

  # Pad with rjust instead of "%02d": the latter raises for the strings
  # "08" and "09" because they are read as invalid octal numbers.
  # Missing time parts (nil) become "00".
  pad = lambda { |part| part.to_s.rjust(2, '0') }

  y, m, d, hh, mm, ss = found.captures
  "#{y}:#{m}:#{pad.call(d)} #{pad.call(hh)}:#{pad.call(mm)}:#{pad.call(ss)}"
end
198
+
199
+ class DOC < Thor
200
+
201
+
202
#
# Show the current metadata tags
#
# Without --tag (or with --all) the six relevant tags are printed with a
# label. With --tag only the values of the listed tags are printed, one per
# line; tag names are matched case-insensitively and surrounding whitespace
# is ignored, the same way `edit` treats them.
#
# TODO: format output as JSON and YAML
# TODO: Enable additional options
#
desc 'show', 'Show metadata of a file'
method_option :all, :type => :boolean, :aliases => '-a', :desc => 'Show all metatags', :default => false, :required => false
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Show specific tag(s), comma separated', :required => false
long_desc <<-LONGDESC
  == General

  Show metatags of a PDF document.

  The following tags are being shown:
  \x5 * Author
  \x5 * Creator
  \x5 * CreateDate
  \x5 * Title
  \x5 * Subject
  \x5 * Keywords

  == Parameters

  --all, -a
  \x5 Show all relevant metatags for a document.

  Relevant tags are Author,Creator, CreateDate, Title, Subject, Keywords.

  --tag, -t
  \x5 Specify the metatag to show. The selected metatag must be one of the relevant tags. Other tags are ignored and nothing is returned.

  == Example

  # Show default metatags for a pdf document
  \x5>CLI show <filename>

  # Show default metatags for example.pdf
  \x5>CLI show example.pdf

  # Show value for metatag 'Author' for the file example.pdf
  \x5>CLI show -t author example.pdf

  # Show value for metatags 'Author','Title' for the file example.pdf
  \x5>CLI show -t author,title example.pdf

LONGDESC
def show(filename)
  metadata = readMetadata(filename)

  if options[:all] || options[:tag].nil?
    # Output all metatags
    puts "Author : " + metadata['author'].to_s
    puts "Creator : " + metadata['creator'].to_s
    puts "CreateDate : " + metadata['createdate'].to_s
    puts "Subject : " + metadata['subject'].to_s
    puts "Title : " + metadata['title'].to_s
    puts "Keywords : " + metadata['keywords'].to_s
  else
    # Output only specific tags. The metadata keys are lowercase, so
    # normalise the names given on the command line before the lookup.
    options[:tag].split(',').each do |tag|
      puts metadata[tag.strip.downcase]
    end
  end
end
270
+
271
#
# Change a MetaTag Attribute
#
# Asks interactively for a new value for every requested tag and writes it
# with exiftool. Values for 'createdate' are run through identifyDate and
# asked for again until a valid date is entered. With --rename the `rename`
# command of this script is run on the file afterwards.
#
# TODO: keywords are added differently according to the documentation
# http://www.sno.phy.queensu.ca/~phil/exiftool/faq.html
desc 'edit', 'Edit Meta Tag(s)'
long_desc <<-LONGDESC
  == General

  Command will edit the metadata of a PDF document. Multiple values can be
  specified or 'all'.

  The command will invoke an interactive user input and request the values
  for the metatag.

  Additionally the file can be renamed at the end according to the new meta
  tags. See `$ #{__FILE__} help rename` for details.

  == Parameters

  --tag, -t
  \x5 Names or list of names of Metatag fields to set, separated by commata.

  --rename, -r
  \x5 Rename file after updating the meta tag information according to the fields.

  This parameter is identical to running `> CLI rename <filename>`

  General example:

  # Edit tag 'TAG' and set a new value interactive.
  \x5>CLI edit -t TAG <filename>

  # Edit tag 'Author' and set new value interactive.
  \x5>CLI edit -t author example.pdf

  # Edit mulitple Tags and set a new value.
  \x5>CLI edit -t tag1,tag2,tag3 <filename>


  == Multiple Tags

  For setting multiple tags list the tags comma separated.

  For setting all tags (Author, Title, Subject, CreateDate, Keywords) use the keyword 'all' as tagname.

  # Set tags 'Author', 'Title', 'Subject' in example.pdf interactivly.
  \x5>CLI edit -t author,title,subject example.pdf`

  # Set tags 'Author', 'Title', 'Subject', 'CreateDate', 'Keywords' in
  example.pdf interactive.
  \x5>CLI edit -t all example.pdf

  == Tag: CreateDate

  In order to enter a value for the 'CreateDate' field, some internal matching is going on in order to make it easier and faster to enter dates and times.

  The following formats are identified/matched:

  \x5 yyyymmdd
  \x5 yyyymmd
  \x5 yyyymmddHHMMSS
  \x5 yyyy-mm-dd HH:MM:SS
  \x5 yyyy:mm:dd HH:MM:SS
  \x5 yyyy.mm.dd HH:MM:SS
  \x5 yyyy-mm-d
  \x5 yyyy-mm-dd
  \x5 yyyy.mm.d
  \x5 yyyy.mm.dd
  \x5 yyyy:mm:d
  \x5 yyyy:mm:dd

  \x5 - If HH:MM:SS or HHMMSS is not provided, those values are automatically set to zero.
  \x5 - The output format of every timestamp is <yyyy:mm:dd HH:MM:SS>
  \x5 - When providing and invalid date, the incorrect date is rejected and the user asked to provide the correct date.

  == Rename file

  In addition to setting the tags the current file can be renamed according to
  the new metadata.

  # Set tag 'Author' and rename file example.pdf
  \x5> CLI edit -t author -r example.pdf

  See `> CLI help rename` for details about renaming.

LONGDESC
method_option :tag, :type => :string, :aliases => '-t', :desc => 'Name of the Tag(s) to Edit', :default => false, :required => true
method_option :rename, :type => :boolean, :aliases => '-r', :desc => 'Rename file after changing meta-tags', :default => false, :required => false
def edit(filename)
  metadata = readMetadata(filename)

  tags =
    if options[:tag] == 'all'
      ['author', 'title', 'subject', 'createdate', 'keywords']
    else
      options[:tag].split(',')
    end

  tags.each do |currentTag|
    # Metadata keys and exiftool tag names are handled in lowercase
    tagKey = currentTag.strip.downcase

    puts "Current value: '#{metadata[tagKey]}'"
    answer = readUserInput("Enter new value for #{currentTag} :")
    if tagKey == 'createdate'
      # identifyDate returns false for anything it cannot parse
      until (answer = identifyDate(answer))
        puts 'Invalid date format'
        answer = readUserInput("Enter new value for #{currentTag} :")
      end
    end
    puts "Changing value for #{currentTag}: '#{metadata[tagKey]}' => #{answer}"

    # Arguments are passed one by one (no shell), so quotes in the value or
    # in the file name cannot break or extend the command. exiftool's
    # standard output is discarded.
    system('exiftool', "-#{tagKey}=#{answer}", '-overwrite_original', filename, :out => File::NULL)
  end

  #
  # If required, run the renaming task afterwards
  # This is not pretty, but seems to be the only way to do this in THOR
  #
  system(__FILE__, 'rename', filename) if options[:rename]
end
392
+
393
#
# Check the metadata for the minimum necessary tags
# (author, subject, createdate, title).
#
# Prints one line per missing tag and terminates the program with exit
# status 1 if at least one is missing, otherwise with status 0.
#
# void check(string)
desc 'check', 'Check Metadata for completeness'
long_desc <<-LONGDESC
  == General

  Show value of the following metatags of a PDF document:

  - Author
  \x5- Creator
  \x5- CreateDate
  \x5- Subject
  \x5- Title
  \x5- Keywords

  == Example

  # Show the values of the metatags for example.pdf
  \x5>CLI check example.pdf

LONGDESC
def check(filename)
  # to_s guards against tags that were read as nil
  missing = readMetadata(filename).select do |key, value|
    key.match(/author|subject|createdate|title/) && value.to_s.empty?
  end

  missing.each_key { |key| puts 'Missing value: ' + key }
  exit(missing.empty? ? 0 : 1)
end
427
+
428
#
# Explain fields and Metatags
# Prints a short description of how a tag is used. Without a term the list
# of available subjects is printed; unknown terms print nothing.
#
desc 'explain','Show more information about usuable Meta-Tags'
long_desc <<-LONGDESC
  == General

  Explain some terms used with the script.

  == Example

  # Show the available subjects
  \x5>CLI explain

  # Show information about the subject 'author'
  \x5>CLI explain author

LONGDESC
def explain(term='')
  # term => lines to print
  explanations = {
    '' => [
      'Available subjects:',
      '- author',
      '- createdate',
      '- keywords',
      '- subject',
      '- title',
      ' ',
      "Run `$ #{__FILE__} explain <subject>` to get more details."
    ],
    'author' => [
      '[Author]',
      ' The sender or creator of the document.'
    ],
    'createdate' => [
      '[CreateDate]',
      ' Date of the document. This is not the date when the file was created, but',
      ' the date found in the document or printed on the document.'
    ],
    'title' => [
      '[Title]',
      ' General type of the document, e.g. Manual, Invoice.'
    ],
    'subject' => [
      '[Subject]',
      ' What is the document about.',
      ' For example:',
      ' Manual: What is the manual about?',
      ' Invoice: Invoice number?',
      ' Contract: Contract number of Subject of the contract?',
      ' Order: Ordernumber of the document?'
    ],
    'keywords' => [
      '[Keywords]',
      ' Anything else that might be of interesst.',
      ' In Orders the elements that have been orders. Contracts might contain the',
      ' Names and adress of the involved parties.',
      ' ',
      ' When writing Invoices with their numbers, these will be automatically be ',
      ' picked up and can be integrated in the filename, e.g. "Invoicenumber 12334'
    ]
  }

  lines = explanations[term]
  lines.each { |line| puts line } if lines
end
488
+
489
#
# Sort the files into directories based on the author
#
# Every *.pdf file directly inside +inputDir+ is moved (or copied with
# --copy) to <destination>/<normalised author>/<file name>. Files without
# an author tag and files whose target already exists are skipped.
#
# Unless logging is disabled, all actions are written to
# <script name>.log in the current working directory. The logger is only
# created and used when logging is enabled.
#
desc 'sort','Sort files into directories sorted by Author'
long_desc <<-LONGDESC
  == General

  Will sort pdf documents into subdirectories according to the value of their
  tag 'author'.

  When using this action a logfile with all actions will be generated in the
  current working directory with the same name as the script and the ending
  '.log'. This can be disabled with the parameter 'log' if required.

  If a document does not have an entry in the meta tag 'author', the file will
  not be processed. This can be seen in the output of the logfile as well.

  === Parameters

  [*destination|d*]
  \x5 Speficy the root output directory to where the folderstructure is being created.

  This parameter is required.

  [*copy|c*]
  \x5 Copy the files instead of moving them.

  [*log|l*]
  \x5 Disable/Enable the logging.
  \x5 Default: enabled.

  === Replacement rules

  The subdirectories for the documents are generated from the values in the
  tag 'author' of each document.

  In order to ensure a clean directory structure, there are certain rules
  for altering the values.
  \x5 1. Whitespaces are replaced by underscores.
  \x5 2. Dots are replaced by underscores.
  \x5 3. All letters are converted to their lowercase version.
  \x5 4. Special characters are serialized

  === Example

  This command does the following:
  \x5 1. Take all pdf documents in the subdirectory ./documents.
  \x5 2. Create the output folder structure in `/tmp/test/`.
  \x5 3. Copy the files instead of moving them.
  \x5 4. Disable the logging.
  \x5> CLI sort -d /tmp/test -c -l false ./documents

LONGDESC
method_option :destination, :aliases => '-d', :required => true, :type => :string, :desc => 'Defines the output directory'
method_option :copy, :aliases => '-c', :required => false, :type => :boolean, :desc => 'Copy files instead of moving them'
method_option :log, :aliases => '-l', :required => false, :type => :boolean, :desc => 'Enable/Disable creation of log files', :default => true
def sort(inputDir = '.')
  destination = options[:destination]
  logenable   = options[:log]
  scriptname  = Pathname.new(__FILE__).basename
  $logger = Logger.new(Dir.pwd.chomp('/') + "/#{scriptname}.log") if logenable

  # Input validation
  abort('Input directory does not exist. Abort.') unless File.exist?(inputDir)
  abort('Input is a single file') unless File.directory?(inputDir)
  if File.file?(destination)
    abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort")
  end

  unless File.directory?(destination)
    FileUtils.mkdir_p(destination)
    $logger.info("Destination '#{destination}' has been created.") if logenable
  end

  # Needed by I18n.transliterate below; set once instead of once per file
  I18n.enforce_available_locales = false

  # Iterate through all files
  Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
    author = readMetadata(file)['author'].to_s

    if author.empty?
      if logenable
        $logger.warn("Missing tag 'Author' for file '#{file}'. Skipping.")
      else
        puts "Missing tag 'Author' for file '#{file}'. Skipping"
      end
      next
    end

    # Normalise the author into a directory name
    author = I18n.transliterate(author.gsub(' ', '_').gsub('.', '_')).downcase
    folderdestination = destination.chomp('/') + '/' + author
    unless File.directory?(folderdestination)
      FileUtils.mkdir_p(folderdestination)
      $logger.info("Folder '#{folderdestination}' has been created.") if logenable
    end
    filedestination = folderdestination + '/' + Pathname.new(file).basename.to_s

    # Final check before touching the filesystem
    if File.exist?(filedestination)
      $logger.warn("File '#{filedestination}' already exists. Ignoring.") if logenable
      next
    end
    $logger.info("File '#{file}' => '#{filedestination}'") if logenable

    # Move/Copy the file
    if options[:copy]
      FileUtils.cp(file, filedestination)
    else
      FileUtils.mv(file, filedestination)
    end
  end
end
596
+
597
#
# Rename the file according to the Metadata
#
# Scheme: YYYYMMDD-author-doktype-<keywords/subject>.pdf (all lowercase)
#
# The program exits with status 1 if one of the tags author, subject,
# createdate or title is empty. The renamed file stays in the directory of
# the source file. With --dryrun (or if the name would not change) the
# old and the new name are printed and nothing is changed.
#
desc 'rename', 'Rename the file according to Metadata'
long_desc <<-LONGDESC
  == General

  Rename a file with the meta tags in the document.

  == Parameter

  --dryrun, -n
  \x5 Simulate the renaming process and show the result without changing the file.

  --all-keywords, -a
  \x5 Use all keywords from the meta information in the file name and ignore the limit.

  --keywords, -k
  \x5 Set the number of keywords used in the filename to a new value.
  \x5 Default: 3

  --outputdir, -o
  \x5 Not implemented yet. Default output dir for the renamed file is the source directory.

  == Example

  # Rename the file according to the metatags
  \x5> CLI rename <filename>

  # Rename example.pdf according to the metatags
  \x5> CLI rename example.pdf

  # Simulate renaming example.pdf according to the metatags (dry-run)
  \x5> CLI rename -n example.pdf

  == Rules

  There are some rules regarding how documents are being renamed

  Rule 1: All documents have the following filenaming structure:

  <yyyymmdd>-<author>-<type>-<additionalInformation>.<extension>

  \x5 # <yyyymmdd>: Year, month and day identival to the meta information in the
  document.
  \x5 # <author>: Author of the document, identical to the meta information
  in the document. Special characters and whitespaces are replaced.
  \x5 # <type>: Document type, is being generated from the title field in the metadata of the document. Document type is a three character abbreviation following the following logic:

  \x5 til => Tilbudt|Angebot
  \x5 odb => Orderbekreftelse
  \x5 fak => Faktura
  \x5 ord => Order
  \x5 avt => Kontrakt|Avtale|Vertrag|contract
  \x5 kvi => Kvittering
  \x5 man => Manual
  \x5 bil => Billett|Ticket
  \x5 inf => Informasjon|Information
  \x5 dok => unknown

  If the dokument type can not be determined automatically, it defaults to 'dok'.

  # <additionalInformation>: Information generated from the metadata fields
  'title', 'subject' and 'keywords'.

  If 'Title' or 'Keywords' contains one of the following keywords, the will be replaced with the corresponding abbreviation followed by the specified value separated by a whitespace:

  \x5 fak => Faktura|Fakturanummer|Rechnung|Rechnungsnummer
  \x5 kdn => Kunde|Kundenummer|Kunde|Kundennummer
  \x5 ord => Ordre|Ordrenummer|Bestellung|Bestellungsnummer
  \x5 kvi => Kvittering|Kvitteringsnummer|Quittung|Quittungsnummer

  Rule 2: The number of keywords used in the filename is defined by the parameter '-k'. See the section of that parameter for more details and the default value.

  Rule 3: Keywords matching 'kvi','fak','ord','kdn' are prioritised.

  Rule 4: Special characters and whitespaces are replaced:

  \x5 ' ' => '_'
  \x5 '/' => '_'

  Rule 5: The new filename has only lowercase characters.

  == Example (detailed)

  # Example PDF with following MetaTags:

  \x5 Filename : example.pdf
  \x5 Author : John
  \x5 Subject : new Product
  \x5 Title : Presentation
  \x5 CreateDate : 1970:01:01 01:00:00
  \x5 Keywords : John Doe, Jane Doe, Mister Doe

  # Renaming the file
  \x5> CLI rename example.pdf
  \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf

  # Simulation to rename the file (no actual change)
  \x5> CLI rename -n example.pdf
  \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf

  # Renaming the file with all keywords
  \x5> CLI rename -n -a example.pdf
  \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe-mister_doe.pdf

LONGDESC
method_option :dryrun, :type => :boolean, :aliases => '-n', :desc => 'Run without making changes', :default => false, :required => false
method_option :all_keywords, :type => :boolean, :aliases => '-a', :desc => 'Add all keywords (no limit)', :default => false, :required => false
method_option :keywords, :type => :numeric, :aliases => '-k', :desc => 'Number of keywords to include (Default: 3)', :default => 3, :required => false
method_option :outputdir, :aliases => '-o', :type => :string, :desc => 'Speficy output directory', :required => false
def rename(filename)
  metadata = readMetadata(filename)

  # Check if the metadata is complete
  metadata.each do |key, value|
    next unless key.match(/author|subject|createdate|title/) && value.to_s.empty?
    puts 'Missing value for ' + key
    puts 'Abort'
    exit 1
  end

  I18n.enforce_available_locales = false

  # "1970:01:01 01:00:00" => "19700101"
  date = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/, '').gsub(/\:/, '')

  # Dots, whitespace and slashes become underscores, dashes are dropped
  author = metadata['author'].gsub(/\./, '_').gsub(/\-/, '').gsub(/[\s\/]/, '_')
  author = I18n.transliterate(author) # Normalising

  # Document type from the title; the first matching pattern wins, so the
  # order confirmation has to be listed before the plain /order/ pattern.
  doktypes = [
    [/(Tilbudt|Angebot)/i,                  'til'],
    [/Ord(er|re)bekreft?else/i,             'odb'],
    [/faktura/i,                            'fak'],
    [/order/i,                              'ord'],
    [/(kontrakt|avtale|vertrag|contract)/i, 'avt'],
    [/kvittering/i,                         'kvi'],
    [/manual/i,                             'man'],
    [/(billett|ticket)/i,                   'bil'],
    [/(informasjon|information)/i,          'inf']
  ]
  title   = metadata['title'].to_s
  hit     = doktypes.find { |pattern, _abbreviation| title =~ pattern }
  doktype = hit ? hit.last : 'dok'
  keywords_preface = setKeywordsPreface(metadata, doktype).to_s

  # Normalise the keywords: abbreviate known number types, replace
  # whitespace and slashes, drop empty entries.
  keywordlist = metadata['keywords'].to_s.split(',').map do |value|
    value = value.strip
    value = value.gsub(/(Faktura|Rechnungs?)(nummer)? /i, 'fak')
    value = value.gsub(/(Kunde)(n)?(nummer)?[ -]/i, 'kdn')
    value = value.gsub(/(Ordre|Bestellung)(s?nummer)? /i, 'ord')
    value = value.gsub(/(Kvittering|Quittung)(snummer)? /i, 'kvi')
    value.gsub(/[\s\/]/, '_')
  end
  keywordlist = keywordlist.reject { |value| value.empty? }

  # Keywords starting with one of the abbreviations are prioritised
  priorityPattern = /^(fak|kdn|ord|kvi)/
  priority, regular = keywordlist.partition { |value| value.match(priorityPattern) }

  # Same selection order as before: prioritised keywords (latest first),
  # then the remaining ones; the limit applies unless all are requested.
  selected = priority.reverse + regular
  unless options[:all_keywords]
    selected = selected.first([options[:keywords].to_i, 0].max)
  end

  # Prioritised keywords go in front of the subject part (in their
  # original order), all other keywords behind it.
  leading, trailing = selected.partition { |value| value.match(priorityPattern) }
  fragments = (leading.reverse + [keywords_preface] + trailing).reject { |part| part.empty? }
  suffix    = I18n.transliterate(fragments.join('-'))

  newFilename = "#{date}-#{author}-#{doktype}"
  newFilename += "-#{suffix}" unless suffix.empty?
  # Lowercase only, and never let a slash from the metadata reach the name
  newFilename = (newFilename + '.pdf').downcase.tr('/', '_')

  # --outputdir is not implemented yet: the renamed file stays in the
  # directory of the source file.
  directory = File.dirname(filename)
  target    = directory == '.' ? newFilename : File.join(directory, newFilename)

  if !options[:dryrun] && File.basename(filename) != newFilename
    begin
      FileUtils.mv(filename, target)
    rescue SystemCallError => e
      abort "Renaming '#{filename}' failed: #{e.message}"
    end
  else
    puts filename + "\n => " + target
  end
end
840
+
841
#
# Version switch: `--version` and `-v` both print the script version
#
map '--version' => :__print_version, '-v' => :__print_version
desc "--version, -v", 'Show the current script version'
def __print_version
  $stdout.puts(VERSION)
end
849
+
850
+ end
851
+
852
# Hand the command line arguments to the Thor command class
DOC.start(ARGV)
853
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdfmd
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Roos
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-16 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Managing the commong pdf metadata settings and renaming the pdf file
14
+ accordingly.
15
+ email: daniel@micronerd.org
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/pdfmd.rb
21
+ homepage: http://rubygems.org/gems/pdfmd
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.6
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: pdfmd - pdf-meta-data management
45
+ test_files: []