pdfmd 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
# Report required metadata tags that have no value.
#
# Reads the file named by ENV['PDFMD_FILENAME'] through readMetadata, prints
# one 'Missing value: <tag>' line for every required tag (author, subject,
# createdate, title) that is empty, and exits with status 1 if at least one
# is missing, 0 otherwise.
filename = ENV.fetch('PDFMD_FILENAME')

required = /author|subject|createdate|title/

# to_s keeps the check from crashing when a tag was read without a value (nil).
missing = readMetadata(filename).select do |key, value|
  key.match(required) && value.to_s.empty?
end

missing.each_key { |key| puts 'Missing value: ' + key }
exit(missing.empty? ? 0 : 1)
@@ -0,0 +1,40 @@
1
#
# Thor command 'edit' for changing the common
# ExifTags within the PDF file
#
# Environment:
#   PDFMD_FILENAME - path of the PDF to edit (required)
#   PDFMD_TAG      - 'all' or a comma separated list of tags to edit
#   PDFMD_RENAME   - 'true' to run the rename command afterwards
#   PDFMD          - command line used to invoke pdfmd for the rename step
#
require 'shellwords'

filename  = ENV.fetch('PDFMD_FILENAME')
optTag    = ENV['PDFMD_TAG']
optRename = ENV['PDFMD_RENAME'] == 'true'
pdfmd     = ENV['PDFMD']

# Without a tag selection there is nothing to split; fail with a readable
# message instead of a NoMethodError on nil.
abort 'No tag specified (PDFMD_TAG is not set). Abort.' if optTag.nil?

metadata = readMetadata(filename)

tags = if optTag == 'all'
         %w[author title subject createdate keywords]
       else
         optTag.split(',')
       end

tags.each do |currentTag|
  # Metadata keys are lower case, user supplied tag names may not be.
  tagKey = currentTag.downcase
  prompt = "Enter new value for #{currentTag} :"

  puts "Current value: '#{metadata[tagKey]}'"
  answer = readUserInput(prompt)

  if tagKey == 'createdate'
    # identifyDate returns false for input it cannot parse: keep asking.
    until (parsedDate = identifyDate(answer))
      puts 'Invalid date format'
      answer = readUserInput(prompt)
    end
    answer = parsedDate
  end

  puts "Changing value for #{currentTag}: '#{metadata[tagKey]}' => #{answer}"

  # Argument-vector form: no shell is involved, so quotes in the new value or
  # in the file name cannot break the command. Output is read and discarded.
  IO.popen(['exiftool', "-#{tagKey}=#{answer}", '-overwrite_original', filename], &:read)
end

#
# If required, run the renaming task afterwards
# This is not pretty, but seems to be the only way to do this in THOR
#
if optRename
  if pdfmd.to_s.empty?
    warn 'Cannot rename: PDFMD is not set.'
  else
    `#{pdfmd} rename #{Shellwords.escape(filename)}`
  end
end
40
+
@@ -0,0 +1,3 @@
1
+ [Author]
2
+ The sender or creator of the document.
3
+
@@ -0,0 +1,6 @@
1
+ [CreateDate]
2
+ Date of the document. This is not the date when the file was created, but
3
+ the date found or printed in the document.
4
+
5
+ Take the date of a contract as an example.
6
+
@@ -0,0 +1,18 @@
1
+ Information about hiera: https://docs.puppetlabs.com/hiera/1/index.html
2
+
3
+ Installation:
4
+
5
+ ```
6
+ $ gem install hiera
7
+ ```
8
+
9
+ Configure default settings in hiera:
10
+
11
+ YAML
12
+ ---
13
+ pdfmd::config:
14
+ sort:
15
+ destination : /tmp/output
16
+ copy : true
17
+ logfile : /var/log/pdfmd.log
18
+
@@ -0,0 +1,9 @@
1
+ [Keywords]
2
+ Anything else that might be of interest.
3
+ In orders, the items that have been ordered. Contracts might contain the
4
+ names and addresses of the involved parties.
5
+
6
+ When writing Invoices with their numbers, these will automatically be
7
+ picked up and can be integrated in the filename, e.g. "Invoicenumber 12334"
8
+ becomes "inv12334"
9
+
@@ -0,0 +1,17 @@
1
# Print help about a metadata subject.
#
# With an empty PDFMD_EXPLAIN the list of available subjects is printed.
# Otherwise the matching text file lib/pdfmd/explain.<subject>.md is printed;
# an unknown subject yields a short message and exit status 1.
term  = ENV.fetch('PDFMD_EXPLAIN')
pdfmd = ENV.fetch('PDFMD')

if term == ''
  puts 'Available subjects:'
  %w[author createdate hiera keywords subject title].each do |subject|
    puts "- #{subject}"
  end
  puts ' '
  puts "Run `$ #{pdfmd} explain <subject>` to get more details."
else
  # File.basename drops any directory part so the lookup cannot leave lib/pdfmd.
  explainFile = "lib/pdfmd/explain.#{File.basename(term.downcase)}.md"

  if File.file?(explainFile)
    puts File.read(explainFile)
  else
    puts "No explanation available for '#{term}'."
    puts "Run `$ #{pdfmd} explain` to list the available subjects."
    exit 1
  end
end
@@ -0,0 +1,8 @@
1
+ [Subject]
2
+ What is the document about.
3
+
4
+ For example:
5
+ Manual : What is the manual about?
6
+ Invoice : Invoice number?
7
+ Contract: Contract number or subject of the contract?
8
+ Order : Ordernumber of the document?
@@ -0,0 +1,5 @@
1
+ [Title]
2
+ General type of the document, e.g. manual, invoice or contract.
3
+
4
+ Can be chosen freely, but some keywords are treated specially when creating
5
+ the filename.
@@ -0,0 +1,130 @@
1
+ # == File: methods.rb
2
+ #
3
+ # General methods for supporting smaller tasks of the Thor commands
4
+
5
#
# Query Hiera installation
# Not cross platform: relies on the `which` and `hostname` commands.
#
# keyword - Hiera key to look up, e.g. 'pdfmd::config'
# facts   - scope handed to hiera as whitespace separated 'name=value'
#           pairs; defaults to "fqdn=<output of hostname>"
#
# Return the hash of the hiera values or false (if no hiera is found)
#
def queryHiera(keyword, facts = 'UNSET')
  # If hiera isn't found, return false
  unless system('which hiera > /dev/null 2>&1')
    puts 'Cannot find "hiera" command in $path.'
    return false
  end

  # Set default facts; `hostname` prints a trailing newline that must not
  # end up in the fact value.
  facts = "fqdn=#{`hostname`.strip}" if facts == 'UNSET'

  # Argument-vector form: the words are passed to hiera without going through
  # a shell, so shell metacharacters in keyword or facts are not interpreted.
  command = ['hiera', *keyword.to_s.split, *facts.to_s.split]
  output = IO.popen(command, &:read)

  # SECURITY NOTE(review): the text printed by hiera is evaluated as Ruby code.
  # Anyone who can influence the Hiera data can run arbitrary code here.
  # Kept because callers rely on the evaluated value; consider a
  # structured output format instead.
  eval(output)
end
26
+
27
+
28
+
29
#
# Set Keywords Preface based on title and subject
#
# If the subject starts with a digit and contains no whitespace (an
# identifier such as an invoice number), the preface is the doktype followed
# by the subject.
# If not: the subject is transliterated and every run of characters other
# than ASCII letters and digits is replaced with a single underscore.
#
# metadata - hash with at least the key 'subject'
# doktype  - short document type string that is prepended to identifiers
#
# Returns the preface string.
#
def setKeywordsPreface(metadata, doktype)
  subject = metadata['subject'].to_s

  # \S (not the former "[^+s]") so that subjects containing spaces are
  # normalised below instead of being copied verbatim into the filename.
  return doktype + subject if subject.match(/\A\d+\S+\z/)

  # Take care of special characters
  I18n.enforce_available_locales = false
  subject = I18n.transliterate(subject)

  # Replace everything else
  subject.gsub(/[^a-zA-Z0-9]+/, '_')
end
51
+
52
+
53
#
# Function to read the metadata from a given file
# hash readMetadata(string)
#
# pathFile - path of the file to inspect. The process is aborted when it is
#            missing (default) or does not point to a regular file.
#
# Returns a hash with the string keys 'keywords', 'subject', 'title',
# 'author', 'creator' and 'createdate'. Every value is a String; tags that
# are not present in the file are ''. When a tag is reported more than once
# the first non-empty value is kept.
#
def readMetadata(pathFile = false)
  metadata = {
    'keywords'   => '',
    'subject'    => '',
    'title'      => '',
    'author'     => '',
    'creator'    => '',
    'createdate' => ''
  }

  unless pathFile && File.file?(pathFile)
    puts "Cannot access file #{pathFile}. Abort"
    abort
  end

  # Tag names as printed by exiftool => key in the result hash
  tagKeys = {
    'Creator'     => 'creator',
    'Author'      => 'author',
    'Create Date' => 'createdate',
    'Subject'     => 'subject',
    'Keywords'    => 'keywords',
    'Title'       => 'title'
  }

  # Fetch the metadata with the help of exiftool (unless something better is
  # found). Argument-vector form: no shell, so quotes in the path are safe.
  begin
    metaStrings = IO.popen(['exiftool', pathFile.to_s], &:read)
  rescue Errno::ENOENT
    warn 'Cannot find "exiftool" command.'
    metaStrings = ''
  end

  # Time to cherrypick the available data
  metaStrings.split("\n").each do |entry|
    # Limit 2: a value that itself contains ' : ' is kept in one piece.
    name, value = entry.split(' : ', 2)
    key = tagKeys[name.to_s.strip]
    next unless key

    # Only fill tags that are still empty; never reset a value already read.
    metadata[key] = value.to_s if metadata[key].empty?
  end

  metadata
end
87
+
88
+
89
#
# Prompt the user and hand back the reply.
#
# textstring - prompt text, 'Enter value: ' by default
#
# Delegates to `ask`, which has to be provided by the surrounding context.
#
def readUserInput(textstring = 'Enter value: ')
  ask(textstring)
end
95
+
96
+
97
#
# Identify a date
# Function takes a string and tries to identify a date in there.
# returns false if no date could be identified
# otherwise the date is returned in the format as
#
#   YYYY:MM:DD HH:mm:ss
#
# For missing time values zero is assumed
#
# Accepted inputs (surrounding whitespace is ignored):
#   YYYYMMDD
#   YYYYMMDDHHmmss
#   YYYY-MM-DD HH:mm:ss   (date separator may be ':', '.' or '-')
#   YYYY-MM-DD            (date separator may be ':', '.' or '-')
# Day and hour may be given with one digit; they are zero padded in the
# result.
#
def identifyDate(datestring)
  year      = '[1-2][90][0-9][0-9]'
  month     = '0[1-9]|1[0-2]'
  day       = '[1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1]'
  hour      = '[0-1][0-9]|2[0-3]|[1-9]'
  minute    = '[0-5][0-9]'
  second    = '[0-5][0-9]'
  separator = '[:.-]'

  patterns = [
    /\A(#{year})(#{month})(#{day})\z/,
    /\A(#{year})(#{month})(#{day})(#{hour})(#{minute})(#{second})\z/,
    /\A(#{year})#{separator}(#{month})#{separator}(#{day})\s(#{hour}):(#{minute}):(#{second})\z/,
    /\A(#{year})#{separator}(#{month})#{separator}(#{day})\z/
  ]

  candidate = datestring.to_s.strip
  patterns.each do |pattern|
    found = pattern.match(candidate)
    next unless found

    # Date-only patterns leave the time captures nil, and nil.to_i is 0.
    # to_i is required before formatting: format('%02d', '08') would try to
    # read the string as an octal number and raise ArgumentError.
    y, mo, d, h, mi, s = found.captures
    return format('%s:%02d:%02d %02d:%02d:%02d', y, mo.to_i, d.to_i, h.to_i, mi.to_i, s.to_i)
  end

  false
end
@@ -0,0 +1,146 @@
1
#
# Thor command 'rename'
#
# Builds the file name
#   <createdate>-<author>-<doktype>[-<keywords>].pdf   (lower case)
# from the metadata of the PDF and either moves the file or, in dry-run
# mode, only prints the new name.
#
# Environment:
#   PDFMD_FILENAME       - path of the PDF to rename
#   PDFMD_ALLKEYWORDS    - when not empty, the number of keywords used in
#                          the file name is limited to PDFMD_NUMBERKEYWORDS
#                          NOTE(review): the name suggests the opposite
#                          meaning - confirm against the caller
#   PDFMD_OUTPUTDIR      - target directory, 'false' for the input directory
#   PDFMD_DRYRUN         - 'false' to really move the file
#   PDFMD_NUMBERKEYWORDS - maximum number of keywords
#
# TODO: Define outputdir from Hiera
# TODO: Add option for copy when renaming
# TODO: Add option to create outputdir if not existing
# TODO: Define option to create outputdir via Hiera
#
filename       = ENV.fetch('PDFMD_FILENAME')
allkeywords    = ENV.fetch('PDFMD_ALLKEYWORDS')
outputdir      = ENV.fetch('PDFMD_OUTPUTDIR') == 'false' ? false : ENV.fetch('PDFMD_OUTPUTDIR')
dryrun         = ENV.fetch('PDFMD_DRYRUN') != 'false'
numberKeywords = ENV.fetch('PDFMD_NUMBERKEYWORDS').to_i

metadata = readMetadata(filename)

# Check if the metadata is complete
# (to_s: a tag that was read without a value may be nil)
metadata.each do |key, value|
  next unless key.match(/author|subject|createdate|title/) && value.to_s.empty?

  puts 'Missing value for ' + key
  puts 'Abort'
  exit 1
end

# Date without the time part and without separators
date = metadata['createdate'].gsub(/\ \d{2}\:\d{2}\:\d{2}.*$/, '').gsub(/\:/, '')

author = metadata['author'].gsub(/\./, '_').gsub(/\-/, '').gsub(/\s/, '_')
I18n.enforce_available_locales = false
author = I18n.transliterate(author) # Normalising

# Title pattern => document type. Order matters: the first match wins
# (e.g. 'Orderbekrefelse' has to be tested before 'order').
doktypes = [
  [/(Tilbudt|Angebot)/i,                   'til'],
  [/Orderbekrefelse/i,                     'odb'],
  [/faktura/i,                             'fak'],
  [/order/i,                               'ord'],
  [/(kontrakt|avtale|vertrag|contract)/i,  'avt'],
  [/kvittering/i,                          'kvi'],
  [/manual/i,                              'man'],
  [/(billett|ticket)/i,                    'bil'],
  [/(informasjon|information)/i,           'inf']
]
title = metadata['title'].to_s
matchedType = doktypes.find { |pattern, _type| title =~ pattern }
doktype = matchedType ? matchedType.last : 'dok'
keywords_preface = setKeywordsPreface(metadata, doktype)

keywords = nil
rawKeywords = metadata['keywords'].to_s
if !rawKeywords.empty?
  keywords = keywords_preface

  #
  # Normalise the keywords and put the numbered ones (invoice, customer,
  # order and receipt numbers) in front.
  #
  keywordssorted = []
  rawKeywords.split(',').each do |value|
    value = value.lstrip.chomp
    value = value.gsub(/(Faktura|Rechnungs)(nummer)? /i, 'fak')
    value = value.gsub(/(Kunde)(n)?(nummer)? /i, 'kdn')
    value = value.gsub(/(Kunde)(n)?(nummer)?-/i, 'kdn')
    value = value.gsub(/(Ordre|Bestellung)(s?nummer)? /i, 'ord')
    value = value.gsub(/(Kvittering|Quittung)(snummer)? /i, 'kvi')
    value = value.gsub(/\s/, '_')
    value = value.gsub(/\//, '_')
    if value.match(/^(fak|kdn|ord|kvi)/)
      keywordssorted.unshift(value)
    else
      keywordssorted.push(value)
    end
  end

  counter = 0
  keywordssorted.each do |value|
    # Exit condition limits the number of keywords used in the filename
    unless allkeywords.empty?
      break if counter > numberKeywords - 1

      counter += 1
    end

    if keywords == ''
      keywords = '-' + value
    elsif value.match(/(kvi|fak|ord|kdn)/i)
      # Numbered keywords go in front of what has been collected so far
      keywords = value + '-' + keywords
    else
      keywords += '-' + value
    end
  end

  # Normalise the keywords as well
  I18n.enforce_available_locales = false
  keywords = I18n.transliterate(keywords)
elsif keywords_preface != ''
  # There are no keywords (rare, but it happens):
  # we are using the title and the subject
  keywords = keywords_preface
end

keywords = '-' + keywords if keywords && keywords[0] != '-'
keywords ||= ''
newFilename = (date + '-' + author + '-' + doktype + keywords + '.pdf').downcase

# Output directory checks
if outputdir
  unless File.exist?(outputdir)
    puts "Error: output dir '#{outputdir}' not found. Abort."
    exit 1
  end
else
  # Output to Inputdir
  outputdir = File.dirname(filename)
end

if !dryrun && filename != newFilename
  # Argument-vector form: no shell, so quotes in file names cannot break the
  # command. The verbose output of mv is read and discarded.
  IO.popen(['mv', '-v', filename, "#{outputdir}/#{newFilename}"], &:read)
else
  puts filename + "\n => " + newFilename
end
@@ -0,0 +1,24 @@
1
# Print metadata of the file named by ENV['PDFMD_FILENAME'].
#
# All tags are listed with a label when PDFMD_ALL is 'true' or PDFMD_TAGS is
# not set; otherwise only the values of the comma separated tags in
# PDFMD_TAGS are printed, one per line.
filename = ENV.fetch('PDFMD_FILENAME')
optTag   = ENV['PDFMD_TAGS']
optAll   = ENV['PDFMD_ALL'] == 'true'

metadata = readMetadata(filename)

if optAll || optTag.nil?
  # Output all metatags
  [
    ['Author : ',     'author'],
    ['Creator : ',    'creator'],
    ['CreateDate : ', 'createdate'],
    ['Subject : ',    'subject'],
    ['Title : ',      'title'],
    ['Keywords : ',   'keywords']
  ].each do |label, key|
    puts label + metadata[key].to_s
  end
else
  # Output specific tag(s)
  optTag.split(',').each { |tag| puts metadata[tag.downcase] }
end
@@ -0,0 +1,100 @@
1
# Sort the PDF files of a directory into one sub folder per author.
#
# Environment (all required, may be empty strings):
#   PDFMD_INPUTDIR    - directory that is searched for *.pdf files
#   PDFMD_DESTINATION - target directory; falls back to the Hiera value
#                       pdfmd::config / sort / destination when empty
#   PDFMD_COPY        - not empty: copy instead of move; when empty the
#                       Hiera value sort / copy decides
#   PDFMD_LOG         - not empty: write a log file (same "empty means off"
#                       convention as PDFMD_COPY and PDFMD_INTERACTIVE)
#   PDFMD_INTERACTIVE - not empty: ask before each file; when empty the
#                       Hiera value sort / interactive decides
require_relative('./methods.rb')
require 'fileutils'
require 'logger'

inputDir        = ENV.fetch('PDFMD_INPUTDIR')
opt_destination = ENV.fetch('PDFMD_DESTINATION')
opt_copy        = ENV.fetch('PDFMD_COPY')
opt_log         = ENV.fetch('PDFMD_LOG')
opt_interactive = ENV.fetch('PDFMD_INTERACTIVE')

# queryHiera returns false when hiera is not installed; fall back to an empty
# configuration so that the lookups below do not crash.
hieraDefaults = queryHiera('pdfmd::config')
sortDefaults  = {}
if hieraDefaults.is_a?(Hash) && hieraDefaults['sort'].is_a?(Hash)
  sortDefaults = hieraDefaults['sort']
end

# ENV values are always strings, so "not given" is the empty string (not nil).
copyAction = !opt_copy.empty?
if opt_copy.empty? && sortDefaults['copy'] == true
  copyAction = true
  puts 'Setting action to copy based on Hiera.'
end

interactiveAction = !opt_interactive.empty?
if opt_interactive.empty? && sortDefaults['interactive'] == true
  interactiveAction = true
  puts 'Setting interactive to true based on Hiera.'
end

# Fetch alternate destination from hiera if available
destination = opt_destination
if destination.nil? || destination == ''
  destination = sortDefaults['destination']
  if destination.nil?
    puts 'No information about destination found.'
    puts 'Set parameter -d or configure hiera.'
    puts 'Abort.'
    exit 1
  end
end

logenable = !opt_log.empty?
logfile   = sortDefaults['logfile'] || File.join(Dir.pwd, File.basename(__FILE__) + '.log')

if logenable
  # Check that the logfile can be written. A file that does not exist yet is
  # fine as long as its directory is writable.
  logWritable = File.exist?(logfile) ? File.writable?(logfile) : File.writable?(File.dirname(logfile))
  unless logWritable
    puts "Cannot write '#{logfile}'. Abort."
    exit 1
  end
  $logger = Logger.new(logfile)
end

# Single place that guards every log call, so disabled logging never hits nil.
log = lambda do |level, message|
  $logger.public_send(level, message) if logenable
end

# Input validation
abort('Input directory does not exist. Abort.') unless File.exist?(inputDir)
abort('Input is a single file') unless File.directory?(inputDir)
if File.file?(destination)
  abort("Output '#{destination}' is an existing file. Cannot create directory with the same name. Abort")
end
unless File.directory?(destination)
  FileUtils.mkdir_p(destination)
  log.call(:info, "Destination '#{destination}' has been created.")
end

# Iterate through all files
Dir[inputDir.chomp('/') + '/*.pdf'].sort.each do |file|
  if interactiveAction
    answer = readUserInput("Process '#{file}' ([y]/n): ")
    answer = 'y' if answer.empty?
    next unless answer.match(/y/)
  end

  author = readMetadata(file)['author'].to_s
  if author.empty?
    if logenable
      log.call(:warn, "Missing tag 'Author' for file '#{file}'. Skipping.")
    else
      puts "Missing tag 'Author' for file '#{file}'. Skipping"
    end
    next
  end

  author = author.gsub(' ', '_').gsub('.', '_')
  I18n.enforce_available_locales = false # Serialize special characters
  author = I18n.transliterate(author).downcase
  folderdestination = destination.chomp('/') + '/' + author

  unless File.directory?(folderdestination)
    FileUtils.mkdir_p(folderdestination)
    log.call(:info, "Folder '#{folderdestination}' has been created.")
  end

  filedestination = folderdestination + '/' + File.basename(file)

  # Final check before touching the filesystem
  if File.exist?(filedestination)
    log.call(:warn, "File '#{filedestination}' already exists. Ignoring.")
    next
  end

  log.call(:info, "File '#{file}' => '#{filedestination}'")

  # Move/Copy the file
  if copyAction
    FileUtils.cp(file, filedestination)
  else
    FileUtils.mv(file, filedestination)
  end
end