pdfmd 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/pdfmd.rb +853 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0e9a319bb39e3972119dabeda67a88918e1662a9
4
+ data.tar.gz: 2ddb7e4e715fe65192685c19bcdb560b36f3708b
5
+ SHA512:
6
+ metadata.gz: e88256a30ab208960bf09071e88ec291c98349fc8b5dd66077867182b1a467bef916e2e6524e3371f95237863dd8e7e957462f52dcb9dc3a05b6bc172326d7ec
7
+ data.tar.gz: 0ae519f568c6409c249e5a154365c70a4c06dcfac3288ba863d336d28c2b4daf0188a4d07d0fa7c7fbf5b34bcfd16e5fe930b28759aa9c9f36db914abfe556ec
@@ -0,0 +1,853 @@
1
+ #!/usr/bin/env ruby
2
+ # == Version 1.3
3
+ #
4
+ # == File: pdfmetadata.rb
5
+ #
6
+ # Show and edit Metadata of PDF files and rename the files accordingly.
7
+ #
8
+ # === Requirements
9
+ #
10
+ # ==== Ruby gems:
11
+ # - thor
12
+ # - highline/import
13
+ # - fileutils
14
+ # - i18n
15
+ # - pathname
16
+ # - logger
17
+ #
18
+ # ==== OS applications:
19
+ #
20
+ # - exiftools
21
+ #
22
+ # === Usage
23
+ #
24
+ # $ ./pdfmetadata <action> <parameter> file
25
+ #
26
+ # $ ./pdfmetadata help <action>
27
+ #
28
+ # An overview about the actions can be seen when running the script without
29
+ # any parameters
30
+ #
31
+ # === Changelog
32
+ #
33
+ # Version 1.3
34
+ # - Small bugfix about special characters in filenames (author).
35
+ # - Bugfix for the tag 'createdate' written as 'CreateDate' which did not
36
+ # take the date then.
37
+ # - Removed inactive code.
38
+ # - Added parameter 'version'
39
+ #
40
+ # Version 1.2
41
+ # - Small bugfix with the sort function and the logfile being created.
42
+ #
43
+ # Version 1.1
44
+ # - Added Function to sort pdf documents into a directory structure based on
45
+ # the author of the document.
46
+ # - Added dependency 'pathname'
47
+ # - Added dependency 'logger'
48
+ # - Added dependency 'i18n'
49
+ # - Added method 'sort'
50
+ # - Changing a tag will now output the old value in the edit dialog.
51
+ # - Updated documentation and descriptions of methods
52
+ #
53
+ # Version 1.0
54
+ # - Added documentation in long description of the commands
55
+ # - Added method "explain" for further information
56
+ #
57
+ # Version 0.9
58
+ # - Added 'rename' option to edit metatags
59
+ # - Fixed some output strings
60
+ #
61
+ # Version 0.x
62
+ # - All other stuff
63
+ #
64
+ # Check and set metadata of PDF documents
65
+ #
66
+ # A complete set of metadata contains
67
+ #
68
+ # * CreateDate
69
+ # * Title
70
+ # * Author
71
+ # * Subject
72
+ # * Keywords (optional)
73
+ #
74
+ # TODO: Include password protected PDF documents as well
75
+ # TODO: Fix broken PDF files automatically
76
+ # TODO: Enable logging in more functions than only "sort"
77
+ # TODO: Read this: http://lostechies.com/derickbailey/2011/04/29/writing-a-thor-application/
78
+ # TODO: ... and this: http://blog.paracode.com/2012/05/17/building-your-tools-with-thor/
79
+ # TODO: Create Gem: http://yehudakatz.com/2010/04/02/using-gemspecs-as-intended/
80
+ # gs \
81
+ # -o repaired.pdf \
82
+ # -sDEVICE=pdfwrite \
83
+ # -dPDFSETTINGS=/prepress \
84
+ # corrupted.pdf
85
+ #
86
+ # == Author
87
+ #
88
+ # Daniel Roos <daniel-git@micronarrativ.org>
89
+ # Source: https://github.com/Micronarrativ/micronarrativ/tree/scripts
90
+ #
91
+ require "thor"
92
+ require "highline/import"
93
+ require "fileutils"
94
+ require "i18n"
95
+ require 'pathname'
96
+ require 'logger'
97
+
98
+ VERSION = '1.3'
99
#
# Function to read the metadata from a given file
# hash readMetadata(string)
#
# Runs `exiftool` on +pathFile+ and returns a Hash with the String keys
# 'keywords', 'subject', 'title', 'author', 'creator' and 'createdate'.
# Tags that exiftool does not report (or reports empty) stay empty Strings.
# When exiftool reports a tag more than once, the first non-empty value wins.
#
# Prints a message and aborts when +pathFile+ is not an accessible regular
# file, or when exiftool cannot be started.
#
def readMetadata(pathFile = false)
  metadata = {
    'keywords'   => '',
    'subject'    => '',
    'title'      => '',
    'author'     => '',
    'creator'    => '',
    'createdate' => ''
  }

  unless pathFile && File.file?(pathFile)
    puts "Cannot access file #{pathFile}. Abort"
    abort
  end

  # Fetch the metadata with the help of exiftool. The argument-array form
  # bypasses the shell, so quotes or other special characters in the file
  # name cannot break (or inject into) the command line.
  begin
    metaStrings = IO.popen(['exiftool', pathFile.to_s], &:read)
  rescue Errno::ENOENT
    abort "Cannot execute 'exiftool'. Abort"
  end

  # exiftool labels (left of the ' : ' separator) mapped to our hash keys.
  tagNames = {
    'Creator'     => 'creator',
    'Author'      => 'author',
    'Create Date' => 'createdate',
    'Subject'     => 'subject',
    'Keywords'    => 'keywords',
    'Title'       => 'title'
  }

  # Time to cherrypick the available data
  metaStrings.to_s.each_line do |entry|
    # Limit the split to two parts so values containing ' : ' stay intact.
    label, value = entry.chomp.split(' : ', 2)
    key = tagNames[label.to_s.strip]
    next if key.nil? || value.nil?

    # Keep the first value found; later duplicates must not overwrite it.
    metadata[key] = value if metadata[key].empty?
  end

  metadata
end
133
+
134
#
# Set Keywords Preface based on title and subject
# If subject matches a number/character combination and contains no spaces,
# the preface will be combined with the doktype.
# If not: preface will contain the whole subject with dots and spaces being
# replaced with underscores
#
# metadata - Hash holding at least the key 'subject'.
# doktype  - String abbreviation of the document type (e.g. 'fak').
#
# Returns the preface as a String.
#
def setKeywordsPreface(metadata, doktype)
  subject = metadata['subject'].to_s

  # A document number: starts with digits and contains no whitespace at all.
  return doktype + subject if subject.match(/\A\d+\S+\z/)

  # Take care of special characters
  I18n.enforce_available_locales = false

  # Replace everything else
  I18n.transliterate(subject).gsub(/[^a-zA-Z0-9]+/, '_')
end
156
+
157
#
# Read user input
#
# Hands the prompt +textstring+ to `ask` and returns whatever `ask` returns.
#
def readUserInput(textstring = 'Enter value: ')
  ask(textstring)
end
163
+
164
#
# Identify a date
# Function takes a string and tries to identify a date in there.
# returns false if no date could be identified
# otherwise the date is returned in the format as
#
# YYYY:MM:DD HH:mm:ss
#
# For missing time values zero is assumed. Single digit days and hours are
# zero-padded in the result. Accepted separators between year, month and day
# are ':', '.' and '-'.
#
def identifyDate(datestring)
  year   = '[1-2][90][0-9][0-9]'
  month  = '0[1-9]|1[0-2]'
  day    = '[1-9]|0[1-9]|[12][0-9]|3[01]'
  hour   = '[0-1][0-9]|2[0-3]|[1-9]'
  minute = '[0-5][0-9]'
  second = '[0-5][0-9]'
  sep    = '[:.\-]'

  # Tried in order; the first pattern matching the whole input wins.
  patterns = [
    /\A(#{year})(#{month})(#{day})\z/,
    /\A(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})\z/,
    /\A(#{year})#{sep}(#{month})#{sep}(#{day})\s(#{hour}):(#{minute}):(#{second})\z/,
    /\A(#{year})#{sep}(#{month})#{sep}(#{day})\z/
  ]

  text = datestring.to_s.strip
  patterns.each do |pattern|
    found = pattern.match(text)
    next if found.nil?

    y, mo, d, h, mi, s = found.captures
    # Pad with rjust instead of "%02d": format() would parse "08"/"09" as
    # invalid octal numbers and raise.
    d = d.rjust(2, '0')
    h = (h || '00').rjust(2, '0')
    return "#{y}:#{mo}:#{d} #{h}:#{mi || '00'}:#{s || '00'}"
  end

  false
end
198
+
199
+ class DOC < Thor
200
+
201
+
202
+ #
203
+ # Show the current metadata tags
204
+ #
205
+ # TODO: format output as JSON and YAML
206
+ # TODO: Enable additional options
207
+ #
208
+ desc 'show', 'Show metadata of a file'
209
+ method_option :all, :type => :boolean, :aliases => '-a', :desc => 'Show all metatags', :default => false, :required => false
210
+ method_option :tag, :type => :string, :aliases => '-t', :desc => 'Show specific tag(s), comma separated', :required => false
211
+ long_desc <<-LONGDESC
212
+ == General
213
+
214
+ Show metatags of a PDF document.
215
+
216
+ The following tags are being shown:
217
+ \x5 * Author
218
+ \x5 * Creator
219
+ \x5 * CreateDate
220
+ \x5 * Title
221
+ \x5 * Subject
222
+ \x5 * Keywords
223
+
224
+ == Parameters
225
+
226
+ --all, -a
227
+ \x5 Show all relevant metatags for a document.
228
+
229
+ Relevant tags are Author,Creator, CreateDate, Title, Subject, Keywords.
230
+
231
+ --tag, -t
232
+ \x5 Specify the metatag to show. The selected metatag must be one of the relevant tags. Other tags are ignored and nothing is returned.
233
+
234
+ == Example
235
+
236
+ # Show default metatags for a pdf document
237
+ \x5>CLI show <filename>
238
+
239
+ # Show default metatags for example.pdf
240
+ \x5>CLI show example.pdf
241
+
242
+ # Show value for metatag 'Author' for the file example.pdf
243
+ \x5>CLI show -t author example.pdf
244
+
245
+ # Show value for metatags 'Author','Title' for the file example.pdf
246
+ \x5>CLI show -t author,title example.pdf
247
+
248
+ LONGDESC
249
# Print the metadata of +filename+ to standard output.
#
# Without the option :tag (or with :all) every relevant tag is printed with
# its label. With :tag only the values of the comma separated tags are
# printed, one per line. Tag names are matched case-insensitively and
# surrounding whitespace is ignored; unknown tags are skipped.
def show(filename)
  metadata = readMetadata(filename)

  # Output all metatags
  if options[:all] || options[:tag].nil?
    puts "Author : " + metadata['author'].to_s
    puts "Creator : " + metadata['creator'].to_s
    puts "CreateDate : " + metadata['createdate'].to_s
    puts "Subject : " + metadata['subject'].to_s
    puts "Title : " + metadata['title'].to_s
    puts "Keywords : " + metadata['keywords'].to_s

  # Output only specific tags
  else
    options[:tag].split(',').each do |tag|
      # The metadata keys are lowercase, so normalise what the user typed.
      key = tag.strip.downcase
      puts metadata[key] if metadata.key?(key)
    end
  end
end
270
+
271
+ #
272
+ # Change a MetaTag Attribute
273
+ #
274
+ # TODO: keywords are added differently according to the documentation
275
+ # http://www.sno.phy.queensu.ca/~phil/exiftool/faq.html
276
+ desc 'edit', 'Edit Meta Tag(s)'
277
+ long_desc <<-LONGDESC
278
+ == General
279
+
280
+ Command will edit the metadata of a PDF document. Multiple values can be
281
+ specified or 'all'.
282
+
283
+ The command will invoke an interactive user input and request the values
284
+ for the metatag.
285
+
286
+ Additionally the file can be renamed at the end according to the new meta
287
+ tags. See `$ #{__FILE__} help rename` for details.
288
+
289
+ == Parameters
290
+
291
+ --tag, -t
292
+ \x5 Names or list of names of Metatag fields to set, separated by commata.
293
+
294
+ --rename, -r
295
+ \x5 Rename file after updating the meta tag information according to the fields.
296
+
297
+ This parameter is identical to running `> CLI rename <filename>`
298
+
299
+ General example:
300
+
301
+ # Edit tag 'TAG' and set a new value interactive.
302
+ \x5>CLI edit -t TAG <filename>
303
+
304
+ # Edit tag 'Author' and set new value interactive.
305
+ \x5>CLI edit -t author example.pdf
306
+
307
+ # Edit mulitple Tags and set a new value.
308
+ \x5>CLI edit -t tag1,tag2,tag3 <filename>
309
+
310
+
311
+ == Multiple Tags
312
+
313
+ For setting multiple tags list the tags comma separated.
314
+
315
+ For setting all tags (Author, Title, Subject, CreateDate, Keywords) use the keyword 'all' as tagname.
316
+
317
+ # Set tags 'Author', 'Title', 'Subject' in example.pdf interactivly.
318
+ \x5>CLI edit -t author,title,subject example.pdf`
319
+
320
+ # Set tags 'Author', 'Title', 'Subject', 'CreateDate', 'Keywords' in
321
+ example.pdf interactive.
322
+ \x5>CLI edit -t all example.pdf
323
+
324
+ == Tag: CreateDate
325
+
326
+ In order to enter a value for the 'CreateDate' field, some internal matching is going on in order to make it easier and faster to enter dates and times.
327
+
328
+ The following formats are identified/matched:
329
+
330
+ \x5 yyyymmdd
331
+ \x5 yyyymmd
332
+ \x5 yyyymmddHHMMSS
333
+ \x5 yyyy-mm-dd HH:MM:SS
334
+ \x5 yyyy:mm:dd HH:MM:SS
335
+ \x5 yyyy.mm.dd HH:MM:SS
336
+ \x5 yyyy-mm-d
337
+ \x5 yyyy-mm-dd
338
+ \x5 yyyy.mm.d
339
+ \x5 yyyy.mm.dd
340
+ \x5 yyyy:mm:d
341
+ \x5 yyyy:mm:dd
342
+
343
+ \x5 - If HH:MM:SS or HHMMSS is not provided, those values are automatically set to zero.
344
+ \x5 - The output format of every timestamp is <yyyy:mm:dd HH:MM:SS>
345
+ \x5 - When providing and invalid date, the incorrect date is rejected and the user asked to provide the correct date.
346
+
347
+ == Rename file
348
+
349
+ In addition to setting the tags the current file can be renamed according to
350
+ the new metadata.
351
+
352
+ # Set tag 'Author' and rename file example.pdf
353
+ \x5> CLI edit -t author -r example.pdf
354
+
355
+ See `> CLI help rename` for details about renaming.
356
+
357
+ LONGDESC
358
+ method_option :tag, :type => :string, :aliases => '-t', :desc => 'Name of the Tag(s) to Edit', :default => false, :required => true
359
+ method_option :rename, :type => :boolean, :aliases => '-r', :desc => 'Rename file after changing meta-tags', :default => false, :required => false
360
# Interactively change one or more metatags of +filename+.
#
# The tags are taken from the option :tag (comma separated, or 'all'). For
# every tag the current value is shown and a new value is requested. Values
# for 'createdate' are requested again until identifyDate accepts them.
# With the option :rename the 'rename' command is run on the file afterwards.
def edit(filename)
  metadata = readMetadata(filename)

  requested = options[:tag].to_s
  tags = if requested.strip.downcase == 'all'
           ['author', 'title', 'subject', 'createdate', 'keywords']
         else
           requested.split(',').map(&:strip).reject(&:empty?)
         end

  tags.each do |currentTag|
    # Change the tag to something we can use here
    tagKey = currentTag.downcase

    puts "Current value: '#{metadata[tagKey]}'"
    answer = readUserInput("Enter new value for #{currentTag} :")
    if tagKey == 'createdate'
      until (answer = identifyDate(answer))
        puts 'Invalid date format'
        answer = readUserInput("Enter new value for #{currentTag} :")
      end
    end
    puts "Changing value for #{currentTag}: '#{metadata[tagKey]}' => #{answer}"

    # Argument list instead of a shell string: quotes in the value or the
    # file name (e.g. "O'Reilly") can neither break nor extend the command.
    # exiftool's own report on stdout is discarded.
    updated = system('exiftool', "-#{tagKey}=#{answer}", '-overwrite_original',
                     filename, out: File::NULL)
    warn "exiftool could not update '#{tagKey}' in '#{filename}'" unless updated
  end

  #
  # If required, run the renaming task afterwards
  # This is not pretty, but seems to be the only way to do this in THOR
  #
  if options[:rename]
    begin
      puts IO.popen([File.expand_path(__FILE__), 'rename', filename], &:read)
    rescue SystemCallError => e
      warn "Could not run rename: #{e.message}"
    end
  end
end
392
+
393
+ #
394
+ # Check the metadata for the minimum necessary tags
395
+ # See documentation at the top of this file for details
396
+ #
397
+ # void check(string)
398
+ desc 'check', 'Check Metadata for completeness'
399
+ long_desc <<-LONGDESC
400
+ == General
401
+
402
+ Show value of the following metatags of a PDF document:
403
+
404
+ - Author
405
+ \x5- Creator
406
+ \x5- CreateDate
407
+ \x5- Subject
408
+ \x5- Title
409
+ \x5- Keywords
410
+
411
+ == Example
412
+
413
+ # Show the values of the metatags for example.pdf
414
+ \x5>CLI show example.pdf
415
+
416
+ LONGDESC
417
# Check the metadata of +filename+ for the minimum necessary tags.
#
# Prints one line per missing tag among author, subject, createdate and
# title. A tag counts as missing when its value is empty or nil. Terminates
# the process with exit status 1 when at least one tag is missing, otherwise
# with exit status 0.
def check(filename)
  missing = readMetadata(filename).select do |key, value|
    key.match(/author|subject|createdate|title/) && value.to_s.empty?
  end

  missing.each_key { |key| puts 'Missing value: ' + key }
  exit(missing.empty? ? 0 : 1)
end
427
+
428
+ #
429
+ # Explain fields and Metatags
430
+ # Show information about how they are used.
431
+ #
432
+ desc 'explain','Show more information about usuable Meta-Tags'
433
+ long_desc <<-LONGDESC
434
+ == General
435
+
436
+ Explain some terms used with the script.
437
+
438
+ == Example
439
+
440
+ # Show the available subjects
441
+ \x5>CLI explain
442
+
443
+ # Show information about the subject 'author'
444
+ \x5>CLI explain author
445
+
446
+ LONGDESC
447
# Print an explanation for +term+.
#
# An empty term lists the available subjects; a known subject prints its
# description; any other term prints nothing.
def explain(term='')
  helpTexts = {
    '' => [
      'Available subjects:',
      '- author',
      '- createdate',
      '- keywords',
      '- subject',
      '- title',
      ' ',
      "Run `$ #{__FILE__} explain <subject>` to get more details."
    ],
    'author' => [
      '[Author]',
      ' The sender or creator of the document.'
    ],
    'createdate' => [
      '[CreateDate]',
      ' Date of the document. This is not the date when the file was created, but',
      ' the date found in the document or printed on the document.'
    ],
    'title' => [
      '[Title]',
      ' General type of the document, e.g. Manual, Invoice.'
    ],
    'subject' => [
      '[Subject]',
      ' What is the document about.',
      ' For example:',
      ' Manual: What is the manual about?',
      ' Invoice: Invoice number?',
      ' Contract: Contract number of Subject of the contract?',
      ' Order: Ordernumber of the document?'
    ],
    'keywords' => [
      '[Keywords]',
      ' Anything else that might be of interesst.',
      ' In Orders the elements that have been orders. Contracts might contain the',
      ' Names and adress of the involved parties.',
      ' ',
      ' When writing Invoices with their numbers, these will be automatically be ',
      ' picked up and can be integrated in the filename, e.g. "Invoicenumber 12334'
    ]
  }

  # Unknown terms map to an empty list, i.e. no output at all.
  helpTexts.fetch(term, []).each { |line| puts line }
end
488
+
489
+ #
490
+ # Sort the files into directories based on the author
491
+ #
492
+ desc 'sort','Sort files into directories sorted by Author'
493
+ long_desc <<-LONGDESC
494
+ == General
495
+
496
+ Will sort pdf documents into subdirectories according to the value of their
497
+ tag 'author'.
498
+
499
+ When using this action a logfile with all actions will be generated in the
500
+ current working directory with the same name as the script and the ending
501
+ '.log'. This can be disabled with the parameter 'log' if required.
502
+
503
+ If a document does not have an entry in the meta tag 'author', the file will
504
+ not be processed. This can be seen in the output of the logfile as well.
505
+
506
+ === Parameters
507
+
508
+ [*destination|d*]
509
+ \x5 Speficy the root output directory to where the folderstructure is being created.
510
+
511
+ This parameter is required.
512
+
513
+ [*copy|c*]
514
+ \x5 Copy the files instead of moving them.
515
+
516
+ [*log|l*]
517
+ \x5 Disable/Enable the logging.
518
+ \x5 Default: enabled.
519
+
520
+ === Replacement rules
521
+
522
+ The subdirectories for the documents are generated from the values in the
523
+ tag 'author' of each document.
524
+
525
+ In order to ensure a clean directory structure, there are certain rules
526
+ for altering the values.
527
+ \x5 1. Whitespaces are replaced by underscores.
528
+ \x5 2. Dots are replaced by underscores.
529
+ \x5 3. All letters are converted to their lowercase version.
530
+ \x5 4. Special characters are serialized
531
+
532
+ === Example
533
+
534
+ This command does the following:
535
+ \x5 1. Take all pdf documents in the subdirectory ./documents.
536
+ \x5 2. Create the output folder structure in `/tmp/test/`.
537
+ \x5 3. Copy the files instead of moving them.
538
+ \x5 4. Disable the logging.
539
+ \x5> CLI sort -d /tmp/test -c -l false ./documents
540
+
541
+ LONGDESC
542
+ method_option :destination, :aliases => '-d', :required => true, :type => :string, :desc => 'Defines the output directory'
543
+ method_option :copy, :aliases => '-c', :required => false, :type => :boolean, :desc => 'Copy files instead of moving them'
544
+ method_option :log, :aliases => '-l', :require => false, :type => :boolean, :desc => 'Enable/Disable creation of log files', :default => true
545
# Sort the pdf files found directly in +inputDir+ into subdirectories of the
# destination directory (option :destination), one per author.
#
# The subdirectory name is the value of the tag 'author' with spaces and
# dots replaced by underscores, transliterated and converted to lowercase.
# Files are moved, or copied when the option :copy is set. Files without an
# author and files whose target already exists are skipped.
#
# With the option :log a logfile named after this script (ending '.log') is
# written in the current working directory. Without it nothing is logged;
# only skipped files without an author are reported on standard output.
#
# Aborts when +inputDir+ is missing or not a directory, or when the
# destination is an existing file.
def sort(inputDir = '.')
  destination = options[:destination]
  logenable   = options[:log]
  scriptname  = Pathname.new(__FILE__).basename

  # Logger.new(nil) swallows every message. With logging disabled the logger
  # used to be nil, which crashed on the first unguarded log call.
  logger = Logger.new(logenable ? File.join(Dir.pwd, "#{scriptname}.log") : nil)
  $logger = logger

  # Input validation
  abort('Input directory does not exist. Abort.') unless File.exist?(inputDir)
  abort('Input is a single file') unless File.directory?(inputDir)
  if File.file?(destination)
    abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort")
  end
  unless File.directory?(destination)
    FileUtils.mkdir_p(destination)
    logger.info("Destination '#{destination}' has been created.")
  end

  # Iterate through all files
  Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
    metadata = readMetadata(file)

    if metadata['author'].to_s.empty?
      if logenable
        logger.warn("Missing tag 'Author' for file '#{file}'. Skipping.")
      else
        puts "Missing tag 'Author' for file '#{file}'. Skipping"
      end
      next
    end

    author = metadata['author'].gsub(' ', '_').gsub('.', '_')
    I18n.enforce_available_locales = false # Serialize special characters
    author = I18n.transliterate(author).downcase

    folderdestination = destination.chomp('/') + '/' + author
    unless File.directory?(folderdestination)
      FileUtils.mkdir_p(folderdestination)
      logger.info("Folder '#{folderdestination}' has been created.")
    end
    filedestination = folderdestination + '/' + Pathname.new(file).basename.to_s

    # Final check before touching the filesystem
    if File.exist?(filedestination)
      logger.warn("File '#{filedestination}' already exists. Ignoring.")
      next
    end

    logger.info("File '#{file}' => '#{filedestination}'")

    # Move/Copy the file
    if options[:copy]
      FileUtils.cp(file, filedestination)
    else
      FileUtils.mv(file, filedestination)
    end
  end
end
596
+
597
+ #
598
+ # Rename the file according to the Metadata
599
+ #
600
+ # Scheme: YYYYMMDD-author-subject-keywords.extension
601
+ desc 'rename', 'Rename the file according to Metadata'
602
+ long_desc <<-LONGDESC
603
+ == General
604
+
605
+ Rename a file with the meta tags in the document.
606
+
607
+ == Parameter
608
+
609
+ --dry-run, -n
610
+ \x5 Simulate the renaming process and show the result without changing the file.
611
+
612
+ --all-keywords, -a
613
+ \x5 Use all keywords from the meta information in the file name and ignore the limit.
614
+
615
+ --keywwords, -k
616
+ \x5 Set the number of keywords used in the filename to a new value.
617
+ \x5 Default: 3
618
+
619
+ --outputdir, -o
620
+ \x5 Not implemented yet. Default output dir for the renamed file is the source directory.
621
+
622
+ == Example
623
+
624
+ # Rename the file according to the metatags
625
+ \x5> CLI rename <filename>
626
+
627
+ # Rename example.pdf according to the metatags
628
+ \x5> CLI rename example.pdf
629
+
630
+ # Simulate renaming example.pdf according to the metatags (dry-run)
631
+ \x5> CLI rename -n example.pdf
632
+
633
+ == Rules
634
+
635
+ There are some rules regarding how documents are being renamed
636
+
637
+ Rule 1: All documents have the following filenaming structure:
638
+
639
+ <yyyymmdd>-<author>-<type>-<additionalInformation>.<extension>
640
+
641
+ \x5 # <yyyymmdd>: Year, month and day identival to the meta information in the
642
+ document.
643
+ \x5 # <author>: Author of the document, identical to the meta information
644
+ in the document. Special characters and whitespaces are replaced.
645
+ \x5 # <type>: Document type, is being generated from the title field in the metadata of the document. Document type is a three character abbreviation following the following logic:
646
+
647
+ \x5 til => Tilbudt|Angebot
648
+ \x5 odb => Orderbekreftelse
649
+ \x5 fak => Faktura
650
+ \x5 ord => Order
651
+ \x5 avt => Kontrakt|Avtale|Vertrag|contract
652
+ \x5 kvi => Kvittering
653
+ \x5 man => Manual
654
+ \x5 bil => Billett|Ticket
655
+ \x5 inf => Informasjon|Information
656
+ \x5 dok => unknown
657
+
658
+ If the dokument type can not be determined automatically, it defaults to 'dok'.
659
+
660
+ # <additionalInformation>: Information generated from the metadata fields
661
+ 'title', 'subject' and 'keywords'.
662
+
663
+ If 'Title' or 'Keywords' contains one of the following keywords, the will be replaced with the corresponding abbreviation followed by the specified value separated by a whitespace:
664
+
665
+ \x5 fak => Faktura|Fakturanummer|Rechnung|Rechnungsnummer
666
+ \x5 kdn => Kunde|Kundenummer|Kunde|Kundennummer
667
+ \x5 ord => Ordre|Ordrenummer|Bestellung|Bestellungsnummer
668
+ \x5 kvi => Kvittering|Kvitteringsnummer|Quittung|Quittungsnummer
669
+
670
+ Rule 2: The number of keywords used in the filename is defined by the parameter '-k'. See the section of that parameter for more details and the default value.
671
+
672
+ Rule 3: Keywords matching 'kvi','fak','ord','kdn' are prioritised.
673
+
674
+ Rule 4: Special characters and whitespaces are replaced:
675
+
676
+ \x5 ' ' => '_'
677
+ \x5 '/' => '_'
678
+
679
+ Rule 5: The new filename has only lowercase characters.
680
+
681
+ == Example (detailed)
682
+
683
+ # Example PDF with following MetaTags:
684
+
685
+ \x5 Filename : example.pdf
686
+ \x5 Author : John
687
+ \x5 Subject : new Product
688
+ \x5 Title : Presentation
689
+ \x5 CreateDate : 1970:01:01 01:00:00
690
+ \x5 Keywords : John Doe, Jane Doe, Mister Doe
691
+
692
+ # Renaming the file
693
+ \x5> CLI rename example.pdf
694
+ \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
695
+
696
+ # Simulation to rename the file (no actual change)
697
+ \x5> CLI rename -n example.pdf
698
+ \x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf
699
+
700
+ # Renaming the file with all keywords
701
+ \x5> CLI rename -n -a example.pdf
702
+ \x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe-mister_doe.pdf
703
+
704
+ LONGDESC
705
+ method_option :dryrun, :type => :boolean, :aliases => '-n', :desc => 'Run without making changes', :default => false, :required => false
706
+ method_option ':all-keywords', :type => :boolean, :aliases => '-a', :desc => 'Add all keywords (no limit)', :default => false, :required => false
707
+ method_option :keywords, :type => :numeric, :aliases => '-k', :desc => 'Number of keywords to include (Default: 3)', :default => 3, :required => false
708
+ method_option :outputdir, :aliases => '-o', :type => :string, :desc => 'Speficy output directory', :default => :false, :required => :false
709
# Rename +filename+ according to its metadata.
#
# Scheme: <yyyymmdd>-<author>-<type>-<additionalInformation>.pdf, all in
# lowercase. The renamed file stays in the directory of the source file.
#
# Options read: :dryrun (only print the result), ':all-keywords' (use every
# keyword) and :keywords (maximum number of keywords otherwise).
#
# Prints the missing tag and exits with status 1 when author, subject,
# createdate or title is empty. When nothing is renamed (dry run, or the
# name is already correct) the old and the new name are printed instead.
def rename(filename)
  metadata = readMetadata(filename)

  # Check if the metadata is complete
  metadata.each do |key, value|
    next unless key.match(/author|subject|createdate|title/) && value.to_s.empty?

    puts 'Missing value for ' + key
    puts 'Abort'
    exit 1
  end

  date = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/, '').gsub(/\:/, '')
  author = metadata['author'].gsub(/\./, '_').gsub(/\-/, '').gsub(/\s/, '_')
  I18n.enforce_available_locales = false
  author = I18n.transliterate(author) # Normalising

  # Document type: the first pattern matching the title wins, so the more
  # specific 'Orderbekreftelse' has to be listed before 'order'.
  doktypes = [
    [/(Tilbudt|Angebot)/i,                      'til'],
    [/Orderbekreftelse/i,                       'odb'],
    [/faktura/i,                                'fak'],
    [/order/i,                                  'ord'],
    [/(kontrakt|avtale|vertrag|contract)/i,     'avt'],
    [/kvittering/i,                             'kvi'],
    [/manual/i,                                 'man'],
    [/(billett|ticket)/i,                       'bil'],
    [/(informasjon|information)/i,              'inf']
  ]
  title = metadata['title'].to_s
  known = doktypes.find { |pattern, _abbreviation| pattern =~ title }
  doktype = known ? known.last : 'dok'

  # Without any keywords the name ends with the preface built from the
  # subject. dup: never modify the string owned by setKeywordsPreface.
  keywords = setKeywordsPreface(metadata, doktype).to_s.dup

  unless metadata['keywords'].to_s.empty?
    # Keywords starting with one of these abbreviations are prioritised.
    priority = /\A(fak|kdn|ord|kvi)/

    #
    # Sort array
    #
    keywordssorted = []
    metadata['keywords'].split(',').each do |raw|
      value = raw.lstrip.chomp
      value = value.gsub(/(Faktura(nummer)?|Rechnung(s(nummer)?)?) /i, 'fak')
      value = value.gsub(/(Kunde)(n)?(nummer)? /i, 'kdn')
      value = value.gsub(/(Kunde)(n)?(nummer)?-/i, 'kdn')
      value = value.gsub(/(Ordre|Bestellung)(s?nummer)? /i, 'ord')
      value = value.gsub(/(Kvittering|Quittung)(snummer)? /i, 'kvi')
      value = value.gsub(/\s/, '_')
      value = value.gsub(/\//, '_')
      if value =~ priority
        keywordssorted.unshift(value)
      else
        keywordssorted.push(value)
      end
    end

    # Limit the number of keywords used in the filename
    # unless all keywords shall be added
    unless options[':all-keywords']
      limit = [options[:keywords].to_i, 0].max
      keywordssorted = keywordssorted.first(limit)
    end

    keywordssorted.each do |value|
      if keywords.empty?
        keywords = '-' + value
      elsif value =~ priority
        # Prioritised keywords go in front of everything collected so far.
        keywords = value + '-' + keywords
      else
        keywords = keywords + '-' + value
      end
    end

    # Normalise the keywords as well
    I18n.enforce_available_locales = false
    keywords = I18n.transliterate(keywords)
  end

  keywords = '-' + keywords unless keywords.empty? || keywords.start_with?('-')

  # A slash would turn a part of the name into a directory.
  newFilename = "#{date}-#{author}-#{doktype}#{keywords}.pdf".downcase.tr('/', '_')

  # Option :outputdir is not implemented yet: the renamed file is placed in
  # the directory of the source file.
  target = File.join(File.dirname(filename), newFilename)

  if options[:dryrun] || File.basename(filename) == newFilename
    puts filename + "\n => " + newFilename
  else
    # No shell involved: quotes in either name cannot break the command.
    FileUtils.mv(filename, target)
  end
end
840
+
841
+ #
842
+ # One parameter to show the current version
843
+ #
844
+ map %w[--version -v] => :__print_version
845
+ desc "--version, -v", 'Show the current script version'
846
# Write the script version (constant VERSION) to standard output.
def __print_version
  $stdout.puts(VERSION)
end
849
+
850
+ end
851
+
852
+ DOC.start
853
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdfmd
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Roos
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-16 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Managing the commong pdf metadata settings and renaming the pdf file
14
+ accordingly.
15
+ email: daniel@micronerd.org
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/pdfmd.rb
21
+ homepage: http://rubygems.org/gems/pdfmd
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.6
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: pdfmd - pdf-meta-data management
45
+ test_files: []