pdfmd 1.4.0 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
#
# Thor command 'check'
#
# Reads the metadata of the file named in PDFMD_FILENAME and reports every
# required field (author, subject, createdate, title) that has no value.
# Exits with status 1 if at least one field is missing, otherwise with 0.
#
filename = ENV.fetch('PDFMD_FILENAME')

missing = readMetadata(filename).select do |key, value|
  key.match(/author|subject|createdate|title/) && value.empty?
end

missing.each_key { |key| puts 'Missing value: ' + key }

exit(missing.empty? ? 0 : 1)
@@ -0,0 +1,40 @@
1
#
# Thor command 'edit' for changing the common
# ExifTags within the PDF file
#
# Environment:
#   PDFMD_FILENAME - path of the PDF file to edit (required)
#   PDFMD_TAG      - 'all' or a comma separated list of tags to edit
#   PDFMD_RENAME   - 'true' to run the rename task after editing
#   PDFMD          - command used to invoke pdfmd for the rename task
#
require 'shellwords'

filename  = ENV.fetch('PDFMD_FILENAME')
optTag    = ENV['PDFMD_TAG']
optRename = ENV['PDFMD_RENAME'] == 'true'
pdfmd     = ENV['PDFMD']

# Without a tag there is nothing to edit; fail with a message instead of
# crashing on nil.split.
if optTag.nil? || optTag.strip.empty?
  puts 'No tag specified. Abort.'
  exit 1
end

metadata = readMetadata(filename)

tags = if optTag == 'all'
         %w[author title subject createdate keywords]
       else
         optTag.split(',').map(&:strip).reject(&:empty?)
       end

tags.each do |currentTag|
  # readMetadata uses lower case keys, exiftool accepts lower case tag names
  tagKey       = currentTag.downcase
  currentValue = metadata[tagKey]

  puts "Current value: '#{currentValue}'"
  answer = readUserInput("Enter new value for #{currentTag} :")

  if tagKey == 'createdate'
    # Keep asking until identifyDate returns a normalised date
    until (parsed = identifyDate(answer))
      puts 'Invalid date format'
      answer = readUserInput("Enter new value for #{currentTag} :")
    end
    answer = parsed
  end

  puts "Changing value for #{currentTag}: '#{currentValue}' => #{answer}"

  # Argument list form: no shell is involved, so quotes or other special
  # characters in the value or the filename cannot break the command.
  # exiftool's own stdout is discarded, as it was with the backtick call.
  written = system('exiftool', "-#{tagKey}=#{answer}", '-overwrite_original',
                   filename, out: File::NULL)
  puts "Could not update '#{currentTag}' with exiftool." unless written
end

#
# If required, run the renaming task afterwards
# This is not pretty, but seems to be the only way to do this in THOR
#
if optRename
  if pdfmd.nil? || pdfmd.empty?
    puts 'Cannot rename: PDFMD is not set.'
  else
    `#{pdfmd} rename #{Shellwords.escape(filename)}`
  end
end
40
+
@@ -0,0 +1,3 @@
1
+ [Author]
2
+ The sender or creator of the document.
3
+
@@ -0,0 +1,6 @@
1
+ [CreateDate]
2
+ Date of the document. This is not the date when the file was created, but
3
+ the date found or printed in the document.
4
+
5
+ Take the date of a contract as an example.
6
+
@@ -0,0 +1,18 @@
1
+ Information about hiera: https://docs.puppetlabs.com/hiera/1/index.html
2
+
3
+ Installation:
4
+
5
+ ```
6
+ $ gem install hiera
7
+ ```
8
+
9
+ Configure default settings in hiera:
10
+
11
+ YAML
12
+ ---
13
+ pdfmd::config:
14
+ sort:
15
+ destination : /tmp/output
16
+ copy : true
17
+ logfile : /var/log/pdfmd.log
18
+
@@ -0,0 +1,9 @@
1
+ [Keywords]
2
+ Anything else that might be of interest.
3
+ In orders, the elements that have been ordered. Contracts might contain the
4
+ names and addresses of the involved parties.
5
+
6
+ When writing invoices with their numbers, these will automatically be
7
+ picked up and can be integrated in the filename, e.g. "Invoicenumber 12334"
8
+ becomes "inv12334"
9
+
@@ -0,0 +1,17 @@
1
#
# Thor command 'explain'
#
# Prints the help text for the subject given in PDFMD_EXPLAIN, or the list
# of available subjects when no subject is given.
#
# Environment:
#   PDFMD_EXPLAIN - subject to explain (may be empty)
#   PDFMD         - command name shown in the usage hint
#
term  = ENV.fetch('PDFMD_EXPLAIN')
pdfmd = ENV.fetch('PDFMD')

subjects = %w[author createdate hiera keywords subject title]
topic    = term.strip.downcase

if topic.empty?
  puts 'Available subjects:'
  subjects.each { |subject| puts "- #{subject}" }
  puts ' '
  puts "Run `$ #{pdfmd} explain <subject>` to get more details."
elsif subjects.include?(topic)
  # Only known subjects reach the file system, so the term can never be
  # used to read an arbitrary path.
  # NOTE(review): the path is relative to the current working directory —
  # confirm this is intended for an installed gem.
  puts File.read("lib/pdfmd/explain.#{topic}.md")
else
  puts "Unknown subject '#{term}'."
  puts "Run `$ #{pdfmd} explain` to list the available subjects."
  exit 1
end
@@ -0,0 +1,8 @@
1
+ [Subject]
2
+ What is the document about.
3
+
4
+ For example:
5
+ Manual : What is the manual about?
6
+ Invoice : Invoice number?
7
+ Contract: Contract number or subject of the contract?
8
+ Order : Order number of the document?
@@ -0,0 +1,5 @@
1
+ [Title]
2
+ General type of the document, e.g. manual, invoice or contract.
3
+
4
+ Can be chosen freely, but some keywords are treated specially when creating
5
+ the filename.
@@ -0,0 +1,130 @@
1
+ # == File: methods.rb
2
+ #
3
+ # General methods for supporting smaller tasks of the Thor commands
4
+
5
#
# Query Hiera installation
# Cross platform support is not a goal at this point.
#
# keyword - Hiera key to look up, e.g. 'pdfmd::config'
# facts   - scope handed to hiera as whitespace separated key=value pairs.
#           When left at 'UNSET' it defaults to "fqdn=<output of hostname>".
#
# Return the evaluated hiera output (a hash for hash values) or false
# (if no hiera is found)
#
def queryHiera(keyword, facts = 'UNSET')
  # If hiera isn't found, return false before running anything else
  unless system('which hiera > /dev/null 2>&1')
    puts 'Cannot find "hiera" command in $path.'
    return false
  end

  # Set default facts; `hostname` prints a trailing newline that must not
  # end up inside the argument.
  facts = "fqdn=#{`hostname`.strip}" if facts == 'UNSET'

  # Argument list form: keyword and facts are handed to hiera directly and
  # are never interpreted by a shell.
  output = IO.popen(['hiera', keyword.to_s, *facts.to_s.split], &:read)

  # SECURITY: hiera prints its result as a Ruby literal and eval executes
  # whatever it prints. Only use this with a trusted hiera configuration.
  eval(output)
end
26
+
27
+
28
+
29
#
# Set Keywords Preface based on title and subject
#
# If the subject starts with a digit and contains no whitespace (e.g. an
# invoice number), the preface is the doktype followed by the subject.
# If not: the preface is the transliterated subject with every run of
# characters other than letters and digits replaced by an underscore.
#
# metadata - hash with at least the key 'subject'
# doktype  - short document type abbreviation, e.g. 'fak'
#
# Returns the preface as a string.
#
def setKeywordsPreface(metadata, doktype)
  subject = metadata['subject'].to_s

  # Number-like subject without spaces: combine with the doktype
  return doktype + subject if subject.match(/\A\d\S*\z/)

  # Take care of special characters, then replace everything else
  I18n.enforce_available_locales = false
  I18n.transliterate(subject).gsub(/[^a-zA-Z0-9]+/, '_')
end
51
+
52
+
53
#
# Function to read the metadata from a given file
# hash readMetadata(string)
#
# pathFile - path of the file to inspect
#
# Returns a hash with the keys 'keywords', 'subject', 'title', 'author',
# 'creator' and 'createdate'. Fields that exiftool does not report are
# empty strings. For every field the first non-empty value wins.
#
# Prints a message and aborts if pathFile is not an existing file.
#
def readMetadata(pathFile = false)
  metadata = {
    'keywords'   => '',
    'subject'    => '',
    'title'      => '',
    'author'     => '',
    'creator'    => '',
    'createdate' => ''
  }

  unless pathFile && File.file?(pathFile)
    puts "Cannot access file #{pathFile}. Abort"
    abort
  end

  # Fetch the metadata with the help of exiftool (unless something better
  # is found). The argument list form keeps the filename away from the
  # shell, so quotes in the name are harmless.
  begin
    output = IO.popen(['exiftool', pathFile.to_s], &:read)
  rescue SystemCallError => e
    # Same result as before when exiftool cannot be run: all fields empty
    warn "Cannot run exiftool: #{e.message}"
    output = ''
  end

  # Time to cherrypick the available data
  output.to_s.split("\n").each do |entry|
    # Split at the first separator only, so values containing ' : ' survive
    label, value = entry.split(' : ', 2)
    next if label.nil?

    fields = []
    fields << 'creator'    if label.strip == 'Creator' # not 'Creator Tool'
    fields << 'author'     if label.start_with?('Author')
    fields << 'createdate' if label.include?('Create Date')
    fields << 'subject'    if label.include?('Subject')
    fields << 'keywords'   if label.include?('Keywords')
    fields << 'title'      if label.include?('Title')

    # Never overwrite (or reset) a value that has already been found
    fields.each { |field| metadata[field] = value.to_s if metadata[field].empty? }
  end

  metadata
end
87
+
88
+
89
#
# Prompt the user with the given text and hand back whatever `ask` returns
#
def readUserInput(textstring = 'Enter value: ')
  ask(textstring)
end
95
+
96
+
97
#
# Identify a date
# Function takes a string and tries to identify a date in there.
# returns false if no date could be identified
# otherwise the date is returned in the format as
#
#   YYYY:MM:DD HH:mm:ss
#
# For missing time values zero is assumed
#
# Accepted inputs (surrounding whitespace is ignored):
#   YYYYMMDD
#   YYYYMMDDHHmmss
#   YYYY-MM-DD            (separator may be ':', '.' or '-')
#   YYYY-MM-DD HH:mm:ss   (separator may be ':', '.' or '-')
#
def identifyDate(datestring)
  year      = '[1-2][90][0-9][0-9]'
  month     = '0[1-9]|1[0-2]'
  day       = '[1-9]|0[1-9]|[12][0-9]|3[01]'
  hour      = '[0-1][0-9]|2[0-3]|[1-9]'
  minute    = '[0-5][0-9]'
  second    = '[0-5][0-9]'
  separator = '[:.\-]'

  patterns = [
    /\A(#{year})(#{month})(#{day})\z/,
    /\A(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})\z/,
    /\A(#{year})#{separator}(#{month})#{separator}(#{day})\s(#{hour}):(#{minute}):(#{second})\z/,
    /\A(#{year})#{separator}(#{month})#{separator}(#{day})\z/
  ]

  input = datestring.to_s.strip
  match = nil
  patterns.find { |pattern| match = pattern.match(input) }
  return false unless match

  # to_i reads '08' and '09' as decimal; Integer() style conversion (as
  # done by "%02d" % '08') would reject them as invalid octal numbers.
  parts = match.captures.map(&:to_i)
  parts.concat([0, 0, 0]) if parts.size == 3

  format('%04d:%02d:%02d %02d:%02d:%02d', *parts)
end
@@ -0,0 +1,146 @@
1
#
# Thor command 'rename'
#
# Builds a normalised filename from the PDF metadata
#
#   <createdate>-<author>-<doktype>[-<keywords>].pdf
#
# and moves the file there. In dry-run mode only the new name is printed.
#
# Environment (all required):
#   PDFMD_FILENAME       - path of the PDF file to rename
#   PDFMD_ALLKEYWORDS    - when empty, every keyword is used; otherwise at
#                          most PDFMD_NUMBERKEYWORDS keywords are used
#   PDFMD_OUTPUTDIR      - target directory, 'false' for the input directory
#   PDFMD_DRYRUN         - 'false' to move the file, anything else prints only
#   PDFMD_NUMBERKEYWORDS - maximum number of keywords in the filename
#
# NOTE(review): any non-empty PDFMD_ALLKEYWORDS (including 'false') enables
# the keyword limit. That looks inverted relative to the name — confirm
# against the caller before changing it.
#
# TODO: Define outputdir from Hiera
# TODO: Add option for copy when renaming
# TODO: Add option to create outputdir if not existing
# TODO: Define option to create outputdir via Hiera
#
filename       = ENV.fetch('PDFMD_FILENAME')
allkeywords    = ENV.fetch('PDFMD_ALLKEYWORDS')
outputdir      = ENV.fetch('PDFMD_OUTPUTDIR')
outputdir      = false if outputdir == 'false'
dryrun         = ENV.fetch('PDFMD_DRYRUN') != 'false'
numberKeywords = ENV.fetch('PDFMD_NUMBERKEYWORDS').to_i

metadata = readMetadata(filename)

# Check if the metadata is complete
metadata.each do |key, value|
  next unless key.match(/author|subject|createdate|title/) && value.empty?

  puts 'Missing value for ' + key
  puts 'Abort'
  exit 1
end

# Date without the time part and without colons, e.g. 20150102
date   = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/, '').gsub(/\:/, '')
author = metadata['author'].gsub(/\./, '_').gsub(/\-/, '').gsub(/\s/, '_')
I18n.enforce_available_locales = false
author = I18n.transliterate(author) # Normalising

# Title pattern => document type abbreviation. The first match wins, so
# the order confirmation entry has to stay in front of the 'order' entry.
doktypes = [
  [/(Tilbudt|Angebot)/i, 'til'],
  # Also accepts the correct spellings 'Ordrebekreftelse' and
  # 'Orderbekreftelse' next to the previously matched 'Orderbekrefelse'.
  [/Ord(er|re)bekreft?else/i, 'odb'],
  [/faktura/i, 'fak'],
  [/order/i, 'ord'],
  [/(kontrakt|avtale|vertrag|contract)/i, 'avt'],
  [/kvittering/i, 'kvi'],
  [/manual/i, 'man'],
  [/(billett|ticket)/i, 'bil'],
  [/(informasjon|information)/i, 'inf']
]
doktype = 'dok'
doktypes.each do |pattern, abbreviation|
  next unless metadata['title'] =~ pattern

  doktype = abbreviation
  break
end
keywords_preface = setKeywordsPreface(metadata, doktype)

# The filename parts are collected in a list and joined once, which avoids
# doubled dashes when the preface is empty.
parts = keywords_preface == '' ? [] : [keywords_preface]

if metadata['keywords'].empty?
  # There are no keywords. Rare, but it happens.
  # We are using the title and the subject only.
  keywords = parts.join
else
  normalised = metadata['keywords'].split(',').map do |value|
    value.lstrip.chomp
         .gsub(/(Faktura|Rechnungs)(nummer)? /i, 'fak')
         .gsub(/(Kunde)(n)?(nummer)? /i, 'kdn')
         .gsub(/(Kunde)(n)?(nummer)?-/i, 'kdn')
         .gsub(/(Ordre|Bestellung)(s?nummer)? /i, 'ord')
         .gsub(/(Kvittering|Quittung)(snummer)? /i, 'kvi')
         .gsub(/\s/, '_')
         .gsub(/\//, '_')
  end
  normalised = normalised.reject(&:empty?)

  # Sort: number-like keywords first (latest one in front), the rest after
  special, regular = normalised.partition { |value| value.match(/^(fak|kdn|ord|kvi)/) }
  keywordssorted = special.reverse + regular

  # Limit the number of keywords used in the filename
  keywordssorted = keywordssorted.first([numberKeywords, 0].max) unless allkeywords.empty?

  keywordssorted.each do |value|
    if value.match(/(kvi|fak|ord|kdn)/i)
      parts.unshift(value)
    else
      parts.push(value)
    end
  end

  # Normalise the keywords as well
  keywords = I18n.transliterate(parts.join('-'))
end
keywords = '-' + keywords unless keywords.empty? || keywords.start_with?('-')

extension   = 'pdf'
newFilename = (date + '-' + author + '-' + doktype + keywords + '.' + extension).downcase

# Output directory checks
if outputdir
  unless File.exist?(outputdir)
    puts "Error: output dir '#{outputdir}' not found. Abort."
    exit 1
  end
else
  # Output to Inputdir
  outputdir = File.dirname(filename)
end
target = File.join(outputdir, newFilename)

# Compare full paths: a file that already carries its final name must not
# be moved onto itself.
if !dryrun && File.expand_path(filename) != File.expand_path(target)
  # Argument list form: no shell, so quotes in filenames are harmless.
  # mv's verbose output is discarded, as it was with the backtick call.
  system('mv', '-v', '--', filename, target, out: File::NULL)
else
  puts filename + "\n => " + newFilename
end
@@ -0,0 +1,24 @@
1
#
# Thor command 'show'
#
# Prints the metadata of the file named in PDFMD_FILENAME: every field when
# PDFMD_ALL is 'true' or PDFMD_TAGS is not set, otherwise only the values
# of the comma separated tags in PDFMD_TAGS (one per line).
#
filename = ENV.fetch('PDFMD_FILENAME')
optTag   = ENV['PDFMD_TAGS']
optAll   = ENV['PDFMD_ALL'] == 'true' ? true : nil

metadata = readMetadata(filename)

if optAll || optTag.nil?
  # Output all metatags
  [
    ["Author : ", 'author'],
    ["Creator : ", 'creator'],
    ["CreateDate : ", 'createdate'],
    ["Subject : ", 'subject'],
    ["Title : ", 'title'],
    ["Keywords : ", 'keywords']
  ].each do |label, key|
    puts label + metadata[key].to_s
  end
else
  # Output specific tag(s)
  optTag.split(',').each { |tag| puts metadata[tag.downcase] }
end
@@ -0,0 +1,100 @@
1
#
# Thor command 'sort'
#
# Sorts every PDF found directly in PDFMD_INPUTDIR into a subfolder of the
# destination that is named after the (normalised) author of the document.
#
# Environment (all required, may be empty strings):
#   PDFMD_INPUTDIR    - directory containing the PDF files
#   PDFMD_DESTINATION - target directory; falls back to Hiera when empty
#   PDFMD_COPY        - non-empty: copy instead of move
#   PDFMD_LOG         - non-empty: write a logfile
#   PDFMD_INTERACTIVE - non-empty: ask before processing each file
#
# Hiera key 'pdfmd::config' may provide defaults below 'sort':
# destination, copy, interactive and logfile.
#
inputDir = ENV.fetch('PDFMD_INPUTDIR')

require_relative('./methods.rb')
require 'fileutils'
require 'logger'

opt_destination = ENV.fetch('PDFMD_DESTINATION')
opt_copy        = ENV.fetch('PDFMD_COPY')
opt_log         = ENV.fetch('PDFMD_LOG')
opt_interactive = ENV.fetch('PDFMD_INTERACTIVE')

# queryHiera returns false without hiera and may return nil for an unknown
# key; fall back to an empty hash so the lookups below never crash.
hieraDefaults = queryHiera('pdfmd::config')
sortDefaults  = {}
if hieraDefaults.is_a?(Hash) && hieraDefaults['sort'].is_a?(Hash)
  sortDefaults = hieraDefaults['sort']
end

copyAction = !opt_copy.empty?
if opt_copy.empty? && sortDefaults['copy'] == true
  copyAction = true
  puts 'Setting action to copy based on Hiera.'
end

interactiveAction = !opt_interactive.empty?
if opt_interactive.empty? && sortDefaults['interactive'] == true
  interactiveAction = true
  puts 'Setting interactive to true based on Hiera.'
end

# Fetch alternate destination from hiera if available
destination = opt_destination
if destination.empty?
  if sortDefaults['destination'].nil?
    puts 'No information about destination found.'
    puts 'Set parameter -d or configure hiera.'
    puts 'Abort.'
    exit 1
  end
  destination = sortDefaults['destination'].to_s
end

# NOTE(review): logging is treated like the copy and interactive switches
# above (non-empty means enabled) — confirm against the caller.
logenable = !opt_log.empty?
logfile   = sortDefaults['logfile'] || File.join(Dir.pwd, File.basename(__FILE__) + '.log')

if logenable
  # An existing logfile must be writable; a new one needs a writable folder
  writable = if File.exist?(logfile)
               File.writable?(logfile)
             else
               File.writable?(File.dirname(logfile))
             end
  unless writable
    puts "Cannot write '#{logfile}'. Abort."
    exit 1
  end
  $logger = Logger.new(logfile)
end

# Input validation
abort('Input directory does not exist. Abort.') unless File.exist?(inputDir)
abort('Input is a single file') unless File.directory?(inputDir)
if File.file?(destination)
  abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort")
end
unless File.directory?(destination)
  FileUtils.mkdir_p(destination)
  $logger.info("Destination '#{destination}' has been created.") if logenable
end

I18n.enforce_available_locales = false # Serialize special characters

# Iterate through all files
Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
  if interactiveAction
    answer = readUserInput("Process '#{file}' ([y]/n): ").to_s.strip
    answer = 'y' if answer.empty?
    next unless answer.match(/\Ay/i)
  end

  metadata = readMetadata(file)
  author   = metadata['author'].to_s
  if author.empty?
    if logenable
      $logger.warn("Missing tag 'Author' for file '#{file}'. Skipping.")
    else
      puts "Missing tag 'Author' for file '#{file}'. Skipping"
    end
    next
  end

  author = I18n.transliterate(author.gsub(' ', '_').gsub('.', '_')).downcase
  folderdestination = destination.chomp('/') + '/' + author

  unless File.directory?(folderdestination)
    FileUtils.mkdir_p(folderdestination)
    $logger.info("Folder '#{folderdestination}' has been created.") if logenable
  end

  filedestination = folderdestination + '/' + File.basename(file)

  # Final check before touching the filesystem
  if File.exist?(filedestination)
    $logger.warn("File '#{filedestination}' already exists. Ignoring.") if logenable
    next
  end

  $logger.info("File '#{file}' => '#{filedestination}'") if logenable

  # Move/Copy the file
  if copyAction
    FileUtils.cp(file, filedestination)
  else
    FileUtils.mv(file, filedestination)
  end
end