pdfmd 1.4.0 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,531 @@
1
+ #!/usr/bin/env ruby
2
+ # == Version 1.4.2
3
+ #
4
+ # == File: pdfmd.rb
5
+ #
6
+ # Show and edit Metadata of PDF files and rename the files accordingly.
7
+ #
8
+ # === Requirements
9
+ #
10
+ # ==== Ruby gems:
11
+ # - thor
12
+ # - highline/import
13
+ # - fileutils
14
+ # - i18n
15
+ # - pathname
16
+ # - logger
17
+ #
18
+ # ==== OS applications:
19
+ #
20
+ # - exiftool
21
+ #
22
+ # === Usage
23
+ #
24
+ # $ ./pdfmd <action> <parameter> file
25
+ #
26
+ # $ ./pdfmd help <action>
27
+ #
28
+ # An overview of the actions is shown when running the script without
29
+ # any parameters
30
+ #
31
+ # Check and set metadata of PDF documents
32
+ #
33
+ # A complete set of metadata contains
34
+ #
35
+ # * CreateDate
36
+ # * Title
37
+ # * Author
38
+ # * Subject
39
+ # * Keywords (optional)
40
+ #
41
+ # TODO: Include password protected PDF documents as well
42
+ # TODO: Fix broken PDF files automatically
43
+ # TODO: Enable logging in more functions than only "sort"
44
+ # TODO: Read this: http://lostechies.com/derickbailey/2011/04/29/writing-a-thor-application/
45
+ # TODO: ... and this: http://blog.paracode.com/2012/05/17/building-your-tools-with-thor/
46
+ # TODO: Create Gem: http://yehudakatz.com/2010/04/02/using-gemspecs-as-intended/
47
+ # gs \
48
+ # -o repaired.pdf \
49
+ # -sDEVICE=pdfwrite \
50
+ # -dPDFSETTINGS=/prepress \
51
+ # corrupted.pdf
52
+ #
53
+ # == Author
54
+ #
55
+ # Daniel Roos <daniel-git@micronarrativ.org>
56
+ # Source: https://github.com/Micronarrativ/ruby-pmd
57
+ #
58
+ require "thor"
59
+ require "highline/import"
60
+ require "fileutils"
61
+ require "i18n"
62
+ require 'pathname'
63
+ require 'logger'
64
+
65
# Version string of the script; printed by the `--version` / `-v` command.
VERSION = "1.4.2"
66
+
67
+ # Include general usage methods
68
+ require_relative('pdfmd/methods.rb')
69
+
70
# Command line interface of pdfmd, implemented as a Thor command set.
#
# Every command follows the same pattern: it copies its argument and its
# options into PDFMD_* environment variables and then loads the matching
# script from ./pdfmd/ with require_relative. How those scripts consume the
# variables is not visible from this file.
#
# NOTE: require_relative loads a file only once per process, so each command
# can effectively be executed once per invocation of the script.
class DOC < Thor

  #
  # Show the current metadata tags.
  #
  # Exports the file name, the requested tag list and the 'all' flag and
  # then loads ./pdfmd/show.rb.
  #
  # TODO: format output as JSON and YAML
  # TODO: Enable additional options
  # TODO: Add command to show current settings (from hiera)
  #
  # @param filename [String] file name as given on the command line
  #
  desc 'show', 'Show metadata of a file'
  method_option :all, type: :boolean, aliases: '-a', desc: 'Show all metatags', default: false, required: false
  method_option :tag, type: :string, aliases: '-t', desc: 'Show specific tag(s), comma separated', required: false
  long_desc <<-LONGDESC
    == General

    Show metatags of a PDF document.

    The following tags are being shown:
    \x5 * Author
    \x5 * Creator
    \x5 * CreateDate
    \x5 * Title
    \x5 * Subject
    \x5 * Keywords

    == Parameters

    --all, -a
    \x5 Show all relevant metatags for a document.

    Relevant tags are Author,Creator, CreateDate, Title, Subject, Keywords.

    --tag, -t
    \x5 Specify the metatag to show. The selected metatag must be one of the relevant tags. Other tags are ignored and nothing is returned.

    The value for the parameter is case insensitive: 'Author' == 'author'

    == Example

    # Show default metatags for a pdf document
    \x5>CLI show <filename>

    # Show default metatags for example.pdf
    \x5>CLI show example.pdf

    # Show value for metatag 'Author' for the file example.pdf
    \x5>CLI show -t author example.pdf

    # Show value for metatags 'Author','Title' for the file example.pdf
    \x5>CLI show -t author,title example.pdf

  LONGDESC
  def show(filename)
    ENV['PDFMD_FILENAME'] = filename
    # options[:tag] is nil when -t is not given; assigning nil to an ENV key
    # removes the variable instead of storing a value.
    ENV['PDFMD_TAGS'] = options[:tag]
    ENV['PDFMD_ALL'] = options[:all].to_s
    require_relative('./pdfmd/show.rb')
  end

  #
  # Change one or more metatag attributes.
  #
  # Exports the file name, the tag list, the rename flag and the path of this
  # script and then loads ./pdfmd/edit.rb.
  #
  # The option --tag is required and deliberately has no default value: a
  # default would satisfy Thor's required-option check and hand a non-String
  # value to ENV, which raises a TypeError.
  #
  # TODO: keywords are added differently according to the documentation
  # http://www.sno.phy.queensu.ca/~phil/exiftool/faq.html
  #
  # @param filename [String] file name as given on the command line
  #
  desc 'edit', 'Edit Meta Tag(s)'
  long_desc <<-LONGDESC
    == General

    Command will edit the metadata of a PDF document. Multiple values can be
    specified or 'all'.

    The command will invoke an interactive user input and request the values
    for the metatag.

    Additionally the file can be renamed at the end according to the new meta
    tags. See `$ #{__FILE__} help rename` for details.

    == Parameters

    --tag, -t
    \x5 Names or list of names of Metatag fields to set, separated by commata.

    --rename, -r
    \x5 Rename file after updating the meta tag information according to the fields.

    This parameter is identical to running `> CLI rename <filename>`

    General example:

    # Edit tag 'TAG' and set a new value interactive.
    \x5>CLI edit -t TAG <filename>

    # Edit tag 'Author' and set new value interactive.
    \x5>CLI edit -t author example.pdf

    # Edit mulitple Tags and set a new value.
    \x5>CLI edit -t tag1,tag2,tag3 <filename>


    == Multiple Tags

    For setting multiple tags list the tags comma separated.

    For setting all tags (Author, Title, Subject, CreateDate, Keywords) use the keyword 'all' as tagname.

    # Set tags 'Author', 'Title', 'Subject' in example.pdf interactivly.
    \x5>CLI edit -t author,title,subject example.pdf`

    # Set tags 'Author', 'Title', 'Subject', 'CreateDate', 'Keywords' in
    example.pdf interactive.
    \x5>CLI edit -t all example.pdf

    == Tag: CreateDate

    In order to enter a value for the 'CreateDate' field, some internal matching is going on in order to make it easier and faster to enter dates and times.

    The following formats are identified/matched:

    \x5 yyyymmdd
    \x5 yyyymmd
    \x5 yyyymmddHHMMSS
    \x5 yyyy-mm-dd HH:MM:SS
    \x5 yyyy:mm:dd HH:MM:SS
    \x5 yyyy.mm.dd HH:MM:SS
    \x5 yyyy-mm-d
    \x5 yyyy-mm-dd
    \x5 yyyy.mm.d
    \x5 yyyy.mm.dd
    \x5 yyyy:mm:d
    \x5 yyyy:mm:dd

    \x5 - If HH:MM:SS or HHMMSS is not provided, those values are automatically set to zero.
    \x5 - The output format of every timestamp is <yyyy:mm:dd HH:MM:SS>
    \x5 - When providing and invalid date, the incorrect date is rejected and the user asked to provide the correct date.

    == Rename file

    In addition to setting the tags the current file can be renamed according to
    the new metadata.

    # Set tag 'Author' and rename file example.pdf
    \x5> CLI edit -t author -r example.pdf

    See `> CLI help rename` for details about renaming.

  LONGDESC
  method_option :tag, type: :string, aliases: '-t', desc: 'Name of the Tag(s) to Edit', required: true
  method_option :rename, type: :boolean, aliases: '-r', desc: 'Rename file after changing meta-tags', default: false, required: false
  def edit(filename)
    ENV['PDFMD_FILENAME'] = filename
    ENV['PDFMD_TAG'] = options[:tag]
    ENV['PDFMD_RENAME'] = options[:rename].to_s
    ENV['PDFMD'] = __FILE__
    require_relative('./pdfmd/edit.rb')
  end

  #
  # Check the metadata for the minimum necessary tags.
  # See the documentation at the top of this file for details.
  #
  # Exports the file name and then loads ./pdfmd/check.rb.
  #
  # @param filename [String] file name as given on the command line
  #
  desc 'check', 'Check Metadata for completeness'
  long_desc <<-LONGDESC
    == General

    Show value of the following metatags of a PDF document:

    - Author
    \x5- Creator
    \x5- CreateDate
    \x5- Subject
    \x5- Title
    \x5- Keywords

    == Example

    # Show the values of the metatags for example.pdf
    \x5>CLI check example.pdf

  LONGDESC
  def check(filename)
    ENV['PDFMD_FILENAME'] = filename
    require_relative('./pdfmd/check.rb')
  end

  #
  # Explain fields and metatags and show information about how they are used.
  #
  # Exports the requested term and the base name of this script and then
  # loads ./pdfmd/explain.rb.
  #
  # @param term [String] subject to explain; defaults to an empty string
  #
  desc 'explain','Show more information about usuable Meta-Tags'
  long_desc <<-LONGDESC
    == General

    Explain some terms used with the script.

    == Example

    # Show the available subjects
    \x5>CLI explain

    # Show information about the subject 'author'
    \x5>CLI explain author

  LONGDESC
  def explain(term='')
    ENV['PDFMD_EXPLAIN'] = term
    ENV['PDFMD'] = File.basename(__FILE__)
    require_relative('./pdfmd/explain.rb')
  end

  #
  # Sort the files into directories based on the author.
  #
  # Exports the input directory and all options as strings and then loads
  # ./pdfmd/sort.rb. Options that were not given become an empty string
  # (nil.to_s); only 'log' has a default (true).
  #
  # @param inputDir [String] directory name as given on the command line
  #
  desc 'sort','Sort files into directories sorted by Author'
  long_desc <<-LONGDESC
    == General

    Will sort pdf documents into subdirectories according to the value of their
    tag 'author'.

    When using this action a logfile with all actions will be generated in the
    current working directory with the same name as the script and the ending
    '.log'. This can be disabled with the parameter 'log' if required.

    If a document does not have an entry in the meta tag 'author', the file will
    not be processed. This can be seen in the output of the logfile as well.

    === Parameters

    [*destination|d*]
    \x5 Speficy the root output directory to where the folderstructure is being created.

    This parameter is required if hiera is not configured.

    This parameter overwrites the hiera defaults

    [*copy|c*]
    \x5 Copy the files instead of moving them.

    [*log|l*]
    \x5 Disable/Enable the logging.
    \x5 Default: enabled.

    [*interactive|i*]
    \x5 Disable/Enable interactive sorting. This will ask for confirmation for
    \x5 each sorting action.
    \x5 Default: disabled.

    === Replacement rules

    The subdirectories for the documents are generated from the values in the
    tag 'author' of each document.

    In order to ensure a clean directory structure, there are certain rules
    for altering the values.
    \x5 1. Whitespaces are replaced by underscores.
    \x5 2. Dots are replaced by underscores.
    \x5 3. All letters are converted to their lowercase version.
    \x5 4. Special characters are serialized

    === Hiera configuration

    Set the default values mentioned below as sub-hash of the main configuration:

    YAML
    \x5sort:
    \x5 key: value

    === Hiera defaults

    The following values can be influenced by the hiera configuration in the
    section 'sort'. Commandline parameter will overwrite the defaults coming
    from hiera unless otherwise notet.

    [*copy*]
    \x5 If set to true copies the files from the source directory instead of moving them.

    [*destination*]
    \x5 Specifies the default output directory (root-directory). Either this or the
    command line parameter for destinations must be set.

    [*logfile*]
    \x5 Specifies the default path for the logfile output. If this is not
    specfied a logfile with the scriptname + '.log' will be created in the
    current working directory.

    [*interactive*]
    \x5 If set to true, each file must be acknowledged to be processed when
    running the script.

    === Example

    This command does the following:
    \x5 1. Take all pdf documents in the subdirectory ./documents.
    \x5 2. Create the output folder structure in `/tmp/test/`.
    \x5 3. Copy the files instead of moving them.
    \x5 4. Disable the logging.
    \x5> CLI sort -d /tmp/test -c -l false ./documents

  LONGDESC
  method_option :destination, aliases: '-d', required: false, type: :string, desc: 'Defines the output directory'
  method_option :copy, aliases: '-c', required: false, type: :boolean, desc: 'Copy files instead of moving them'
  method_option :log, aliases: '-l', required: false, type: :boolean, desc: 'Enable/Disable creation of log files', default: true
  method_option :interactive, aliases: '-i', required: false, type: :boolean, desc: 'Enable/Disable interactive sort'
  def sort(inputDir)
    ENV['PDFMD_INPUTDIR'] = inputDir
    ENV['PDFMD_DESTINATION'] = options[:destination].to_s
    ENV['PDFMD_COPY'] = options[:copy].to_s
    ENV['PDFMD_LOG'] = options[:log].to_s
    ENV['PDFMD_INTERACTIVE'] = options[:interactive].to_s
    require_relative('./pdfmd/sort.rb')
  end

  #
  # Rename the file according to the metadata.
  #
  # Scheme: YYYYMMDD-author-subject-keywords.extension
  #
  # Exports the file name and all options as strings and then loads
  # ./pdfmd/rename.rb.
  #
  # The default of 'outputdir' is the String 'false' so that PDFMD_OUTPUTDIR
  # keeps the value "false" when the option is omitted. The option is
  # declared with a real `false` for :required; the Symbol :false would be
  # truthy and mark the option as required.
  #
  # @param filename [String] file name as given on the command line
  #
  desc 'rename', 'Rename the file according to Metadata'
  long_desc <<-LONGDESC
    == General

    Rename a file with the meta tags in the document.

    == Parameter

    --dryrun, -n
    \x5 Simulate the renaming process and show the result without changing the file.

    --allkeywords, -a
    \x5 Use all keywords from the meta information in the file name and ignore the limit.

    --keywords, -k
    \x5 Set the number of keywords used in the filename to a new value.
    \x5 Default: 3

    --outputdir, -o
    \x5 Rename the file and move it to the directory defined in '--outputdir'.

    The directory must exist at runtime.

    == Example

    # Rename the file according to the metatags
    \x5> CLI rename <filename>

    # Rename example.pdf according to the metatags
    \x5> CLI rename example.pdf

    # Simulate renaming example.pdf according to the metatags (dry-run)
    \x5> CLI rename -n example.pdf

    == Rules

    There are some rules regarding how documents are being renamed

    Rule 1: All documents have the following filenaming structure:

    <yyyymmdd>-<author>-<type>-<additionalInformation>.<extension>

    \x5 # <yyyymmdd>: Year, month and day identival to the meta information in the
    document.
    \x5 # <author>: Author of the document, identical to the meta information
    in the document. Special characters and whitespaces are replaced.
    \x5 # <type>: Document type, is being generated from the title field in the metadata of the document. Document type is a three character abbreviation following the following logic:

    \x5 til => Tilbudt|Angebot
    \x5 odb => Orderbekreftelse
    \x5 fak => Faktura
    \x5 ord => Order
    \x5 avt => Kontrakt|Avtale|Vertrag|contract
    \x5 kvi => Kvittering
    \x5 man => Manual
    \x5 bil => Billett|Ticket
    \x5 inf => Informasjon|Information
    \x5 dok => unknown

    If the dokument type can not be determined automatically, it defaults to 'dok'.

    # <additionalInformation>: Information generated from the metadata fields
    'title', 'subject' and 'keywords'.

    If 'Title' or 'Keywords' contains one of the following keywords, the will be replaced with the corresponding abbreviation followed by the specified value separated by a whitespace:

    \x5 fak => Faktura|Fakturanummer|Rechnung|Rechnungsnummer
    \x5 kdn => Kunde|Kundenummer|Kunde|Kundennummer
    \x5 ord => Ordre|Ordrenummer|Bestellung|Bestellungsnummer
    \x5 kvi => Kvittering|Kvitteringsnummer|Quittung|Quittungsnummer

    Rule 2: The number of keywords used in the filename is defined by the parameter '-k'. See the section of that parameter for more details and the default value.

    Rule 3: Keywords matching 'kvi','fak','ord','kdn' are prioritised.

    Rule 4: Special characters and whitespaces are replaced:

    \x5 ' ' => '_'
    \x5 '/' => '_'

    Rule 5: The new filename has only lowercase characters.

    == Example (detailed)

    # Example PDF with following MetaTags:

    \x5 Filename : example.pdf
    \x5 Author : John
    \x5 Subject : new Product
    \x5 Title : Presentation
    \x5 CreateDate : 1970:01:01 01:00:00
    \x5 Keywords : John Doe, Jane Doe, Mister Doe

    # Renaming the file
    \x5> CLI rename example.pdf
    \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf

    # Simulation to rename the file (no actual change)
    \x5> CLI rename -n example.pdf
    \x5example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe.pdf

    # Renaming the file with all keywords
    \x5> CLI rename -n -a example.pdf

    \x5 example.pdf => 19700101-john-dok-new_product-john_doe-jane_doe-mister_doe.pdf

  LONGDESC
  method_option :dryrun, type: :boolean, aliases: '-n', desc: 'Run without making changes', default: false, required: false
  method_option :allkeywords, type: :boolean, aliases: '-a', desc: 'Add all keywords (no limit)', default: false, required: false
  method_option :keywords, type: :numeric, aliases: '-k', desc: 'Number of keywords to include (Default: 3)', default: 3, required: false
  method_option :outputdir, aliases: '-o', type: :string, desc: 'Speficy output directory', default: 'false', required: false
  def rename(filename)
    ENV['PDFMD_FILENAME'] = filename
    ENV['PDFMD_DRYRUN'] = options[:dryrun].to_s
    ENV['PDFMD_ALLKEYWORDS'] = options[:allkeywords].to_s
    ENV['PDFMD_OUTPUTDIR'] = options[:outputdir].to_s
    ENV['PDFMD_NUMBERKEYWORDS'] = options[:keywords].to_s
    require_relative('./pdfmd/rename.rb')
  end

  #
  # One parameter to show the current version.
  #
  # '--version' and '-v' are mapped onto this command; it prints the VERSION
  # constant defined at the top of the file.
  #
  map %w[--version -v] => :__print_version
  desc "--version, -v", 'Show the current script version'
  def __print_version
    puts VERSION
  end

end
529
+
530
# Entry point: let Thor parse the command line arguments and dispatch to the
# matching command of DOC.
DOC.start(ARGV)
531
+