pdfh 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
# frozen_string_literal: true

module Pdfh
  # Calculates the period (year and month) a document belongs to, starting from
  # the date extracted from the document and the month offset of its sub type.
  class DocumentPeriod
    MONTHS_IN_YEAR = 12

    attr_reader :month, :year

    # @param month [String, Integer] extracted month; resolved with Month.normalize_to_i
    # @param month_offset [Integer, nil] months to add to the extracted month, nil counts as 0
    # @param year [String, Integer] extracted year; a two digit year is read as 20YY
    # @param day [String, Integer, nil] extracted day; stored but not used to compute the period
    # @return [self]
    def initialize(month:, month_offset:, year:, day: nil)
      @day = day
      @raw_month = month
      @raw_year = year
      shifted_month = Month.normalize_to_i(month) + (month_offset || 0)
      # Work zero-based so divmod yields the number of whole years to carry
      # (negative when the offset moves before January) and the month index.
      # This covers any offset, not only the 0 and 13 boundary values.
      year_offset, month_index = (shifted_month - 1).divmod(MONTHS_IN_YEAR)
      @month = month_index + 1
      @year = expand_year(year) + year_offset
    end

    # @return [String] the period formatted as "YYYY-MM"
    def to_s
      "#{year}-#{month.to_s.rjust(2, "0")}"
    end

    private

    # Converts the year to an Integer, expanding two digit years to the 2000s.
    # The value goes through to_s first because Integer#size is a byte size,
    # not a digit count.
    # @param value [String, Integer]
    # @return [Integer]
    def expand_year(value)
      text = value.to_s
      (text.size == 2 ? "20#{text}" : text).to_i
    end
  end
end
@@ -0,0 +1,163 @@
# frozen_string_literal: true

require "fileutils"

module Pdfh
  # Main functionality. This class is intended to manage the pdf documents:
  # it selects the PDF files to work on (an explicit file list or the configured
  # look up directories) and runs the write, backup and companion copy steps.
  class DocumentProcessor
    # Width used to align the labels printed for every processed document
    LABEL_WIDTH = 12

    # Reads the command line options and sets the verbose and dry flags.
    # File mode is selected when a document type id was given, directory mode otherwise.
    # @return [self]
    def initialize
      @options = Pdfh.parse_argv
      Pdfh.verbose = options[:verbose]
      Pdfh.dry = options[:dry]
      Pdfh.verbose_print(options)
      @mode = options.key?(:type) ? :file : :directory
    end

    # Loads the settings and processes the documents according to the mode.
    # A SettingsIOError triggers the creation of a settings file and exits with
    # status 1; any other StandardError is reported through Pdfh.error_print.
    # @return [void]
    def start
      @settings = Settings.new(Pdfh.search_config_file)
      puts "Destination path: #{@settings.base_path.colorize(:light_blue)}" if Pdfh.verbose?

      @mode == :file ? process_files : process_lookup_dirs
    rescue SettingsIOError => e
      Pdfh.error_print(e.message, exit_app: false)
      Pdfh.create_settings_file
      exit(1)
    rescue StandardError => e
      Pdfh.error_print e.message
    end

    private

    attr_reader :options

    # Finds the first configured document type whose file pattern matches.
    # @param [String] file_name
    # @return [DocumentType, nil] nil when no configured type matches
    def match_doc_type(file_name)
      @settings.document_types.find { |type| type.re_file.match?(file_name) }
    end

    # @param [String] id document type id, compared with DocumentType#gid
    # @return [DocumentType, nil] nil when no configured type has that id
    def doc_type_by_id(id)
      @settings.document_types.find { |type| type.gid == id }
    end

    # Processes the files given on the command line with the requested type.
    # Files that do not exist or lack the ".pdf" extension are skipped with a warning.
    # @raise [ArgumentError] when no files were given or the type id is not configured
    # @return [void]
    def process_files
      type_id = options[:type]
      raise ArgumentError, "No files provided to process #{type_id.inspect} type." unless options[:files]

      type = doc_type_by_id(type_id)
      # Fail once, with a clear message, instead of failing on every file with a nil type.
      raise ArgumentError, "Document type #{type_id.inspect} is not defined in the configuration." unless type

      puts
      options[:files].each do |file|
        unless File.exist?(file)
          Pdfh.warn_print "File #{file.inspect} does not exist."
          next
        end
        unless File.extname(file) == ".pdf"
          Pdfh.warn_print "File #{file.inspect} is not a pdf."
          next
        end
        process_document(file, type)
      end
    end

    # @return [void]
    def process_lookup_dirs
      @settings.lookup_dirs.each { |work_directory| process_directory(work_directory) }
    end

    # Processes every "*.pdf" file in the directory that matches a document type.
    # In verbose mode the files without a matching type are listed afterwards.
    # @param [String] work_directory
    # @return [void]
    def process_directory(work_directory)
      Pdfh.headline(work_directory)
      processed_count = 0
      ignored_files = []
      Dir["#{work_directory}/*.pdf"].each do |pdf_file|
        type = match_doc_type(pdf_file)
        if type
          processed_count += 1
          process_document(pdf_file, type)
        else
          ignored_files << basename_without_ext(pdf_file)
        end
      end
      puts " (No files processed)".colorize(:light_black) if processed_count.zero?
      return unless Pdfh.verbose?

      puts "\n No document type found for these PDF files:" if ignored_files.any?
      ignored_files.each.with_index(1) { |file, index| Pdfh.ident_print index, file, color: :magenta }
    end

    ##
    # Generate document, and process actions.
    # Any StandardError raised while handling the document is printed as a
    # "Doc Error" line, so one failing document does not stop the others.
    # @param [String] file
    # @param [DocumentType] type
    # @return [void]
    def process_document(file, type)
      base = File.basename(file)
      puts "Working on #{base.colorize(:light_green)}"
      Pdfh.ident_print "Type", type.name, color: :light_blue, width: LABEL_WIDTH
      doc = Document.new(file, type)
      Pdfh.ident_print "Sub-Type", doc.sub_type, color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "Period", doc.period, color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "New Name", doc.new_name, color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "Store Path", doc.store_path, color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "Other files", doc.companion_files(join: true), color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "Print CMD", doc.print_cmd, color: :light_blue, width: LABEL_WIDTH
      Pdfh.ident_print "Processed?", "No (in Dry mode)", color: :red, width: LABEL_WIDTH if Pdfh.dry?
      write_pdf(doc)
    rescue StandardError => e
      Pdfh.ident_print "Doc Error", e.message, color: :red, width: LABEL_WIDTH
    end

    # Writes the new PDF under the destination base path, then backs up the
    # original file and copies its companion files next to the new PDF.
    # NOTE(review): the destination directory is created even in dry mode - confirm this is intended.
    # @param [Document] document
    # @return [void]
    def write_pdf(document)
      base_path = @settings.base_path
      full_path = File.join(base_path, document.store_path, document.new_name)
      dir_path = File.join(base_path, document.store_path)

      FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)

      document.pdf_doc.write_new_pdf(dir_path, full_path)
      make_document_backup(document)
      copy_companion_files(dir_path, document)
    end

    # Create a backup of original document by renaming it inside its own directory.
    # Nothing is renamed in dry mode.
    # @param [Document] document
    # @return [void]
    def make_document_backup(document)
      Pdfh.verbose_print "~~~~~~~~~~~~~~~~~~ Creating PDF backup"
      Dir.chdir(document.home_dir) do
        Pdfh.verbose_print " Working on: #{document.home_dir.inspect} directory"
        Pdfh.verbose_print " mv #{document.file_name.inspect} -> #{document.backup_name.inspect}"
        File.rename(document.file_name, document.backup_name) unless Pdfh.dry?
      end
    end

    # Copies every companion file to the destination, named after the new PDF
    # name but keeping its own extension. Nothing is copied in dry mode.
    # @param [String] destination directory where the new PDF is stored
    # @param [Document] document
    # @return [void]
    def copy_companion_files(destination, document)
      Pdfh.verbose_print "~~~~~~~~~~~~~~~~~~ Writing Companion files"
      document.companion_files.each do |file|
        Pdfh.verbose_print " Working on #{file.inspect}..."
        src_name = File.join(document.home_dir, file)
        src_ext = File.extname(file)
        dest_name = File.basename(document.new_name, ".pdf")
        dest_full = File.join(destination, "#{dest_name}#{src_ext}")
        Pdfh.verbose_print " cp #{src_name} --> #{dest_full}"
        FileUtils.cp(src_name, dest_full) unless Pdfh.dry?
      end
    end

    # @param [String] file
    # @return [String] the file name without directory and extension
    def basename_without_ext(file)
      File.basename(file, File.extname(file))
    end
  end
end
@@ -0,0 +1,43 @@
# frozen_string_literal: true

require "base64"

module Pdfh
  # Variation of a document type: a name and the month offset applied to the period
  DocumentSubType = Struct.new(:name, :month_offset, keyword_init: true)

  # Document type definition built from one entry of the configuration.
  DocumentType = Struct.new(:name, :re_file, :re_date, :pwd, :store_path, :name_template, :sub_types, :print_cmd,
                            keyword_init: true) do
    # Normalizes the raw values: compiles both patterns, decodes the Base64
    # password, builds the sub types and defaults the name template to "{original}".
    # @param args [Hash] raw values of the document type
    # @raise [ArgumentError] when "re_file" or "re_date" is missing
    # @return [self]
    def initialize(args)
      super
      # Regexp.new(nil) would raise a TypeError that does not say which setting is missing.
      %i[re_file re_date].each do |setting|
        next unless self[setting].nil?

        raise ArgumentError, "Document type #{name.inspect} is missing the #{setting.to_s.inspect} setting."
      end
      self.name_template ||= "{original}"
      self.re_file = Regexp.new(re_file)
      self.re_date = Regexp.new(re_date)
      self.pwd = Base64.decode64(pwd) if pwd
      self.sub_types = extract_subtype(sub_types) if sub_types
    end

    # Identifier derived from the name: lower case, characters other than
    # letters, digits and whitespace removed, spaces replaced by dashes.
    # @return [String]
    def gid
      name.downcase.gsub(/[^0-9A-Za-z\s]/, "").tr(" ", "-")
    end

    # Finds the first sub type whose name, used as a case insensitive pattern,
    # matches the text.
    # @param text [String]
    # @return [DocumentSubType, nil] nil when nothing matches or no sub types are defined
    def sub_type(text)
      sub_types&.find { |st| /#{st.name}/i.match?(text) }
    end

    private

    # @param sub_types [Array<Hash>] entries with "name" and "month_offset" keys
    # @return [Array<DocumentSubType>]
    def extract_subtype(sub_types)
      sub_types.map do |st|
        DocumentSubType.new(name: st["name"], month_offset: st["month_offset"].to_i)
      end
    end
  end
end
data/lib/pdfh/month.rb CHANGED
@@ -1,41 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pdfh
4
- ##
5
- # Handles Month convertions
4
+ # Handles Month conversions
6
5
  class Month
7
- MONTHS = {
8
- enero: 1,
9
- febrero: 2,
10
- marzo: 3,
11
- abril: 4,
12
- mayo: 5,
13
- junio: 6,
14
- julio: 7,
15
- agosto: 8,
16
- septiembre: 9,
17
- octubre: 10,
18
- noviembre: 11,
19
- diciembre: 12
20
- }.freeze
21
-
22
- ##
23
- # @param [String] month
24
- # @return [Integer]
25
- def self.normalize(month)
26
- # When param is a number
27
- month_num = month.to_i
28
- return month_num if month_num.between?(1, 12) # (1..12).include?(month_num)
29
-
30
- # When param is a 3 char month: 'mar', 'nov'
31
- if month.size == 3
32
- MONTHS.each_key do |mon|
33
- return MONTHS[mon] if mon.to_s[0, 3] == month
34
- end
6
+ class << self
7
+ FINDER_3L = proc { |name_search, month| month[0, 3].casecmp?(name_search) }
8
+ FINDER_FL = proc { |name_search, month| month.casecmp?(name_search) }
9
+
10
+ # rubocop:disable Layout/SpaceInsideArrayPercentLiteral
11
+ MONTHS_EN = %w[january february march april may june july august september october november december].freeze
12
+ MONTHS_ES = %w[enero febrero marzo abril mayo junio julio agosto septiembre octubre noviembre diciembre].freeze
13
+ # rubocop:enable Layout/SpaceInsideArrayPercentLiteral
14
+
15
+ # @param [String] month
16
+ # @return [Integer]
17
+ def normalize_to_i(month)
18
+ # When param is a number
19
+ month_num = month.to_i
20
+ raise ArgumentError, "Month #{month.inspect} is not a valid month number" if month_num > 12
21
+
22
+ return month_num if month_num.between?(1, 12)
23
+
24
+ # When param is a 3 char month: 'mar', 'nov'
25
+ return find_month(month, FINDER_3L) if month.size == 3
26
+
27
+ # When param has a direct match
28
+ find_month(month, FINDER_FL)
35
29
  end
36
30
 
37
- # When param has a direct match
38
- MONTHS[month.to_sym]
31
+ private
32
+
33
+ # @return [Integer]
34
+ def find_month(name, finder)
35
+ find_by_name = finder.curry[name]
36
+ match = MONTHS_ES.find(&find_by_name)
37
+ return month_number(MONTHS_ES, match) if match
38
+
39
+ match = MONTHS_EN.find(&find_by_name)
40
+ return month_number(MONTHS_EN, match) if match
41
+
42
+ raise ArgumentError, "Month #{name.inspect} is not valid"
43
+ end
44
+
45
+ # @return [Integer]
46
+ def month_number(month_array, name)
47
+ month_array.rindex(name) + 1
48
+ end
39
49
  end
40
50
  end
41
51
  end
@@ -0,0 +1,41 @@
# frozen_string_literal: true

require "optparse"

module Pdfh
  # Command line options definition.
  # The --list-types, --version and --help options print their output and exit.
  OPT_PARSER = OptionParser.new do |opts|
    opts.banner = "Usage: #{opts.program_name} [options] [file1 ...]"
    opts.separator ""
    opts.separator "Specific options:"

    opts.on("-tID", "--type=ID", "Document type id (requires a trailing file list)")
    opts.on_tail("-T", "--list-types", "List document types in configuration") do
      settings = Settings.new(Pdfh.search_config_file)
      indent = " " * 4
      # max is nil when no document types are configured; never go below the header width.
      id_width = [settings.document_types.map { |t| t.gid.size }.max.to_i, "ID".size].max
      puts "#{indent}#{"ID".ljust(id_width)} Type Name"
      puts "#{indent}#{"-" * id_width} -----------------------"
      settings.document_types.each do |type|
        puts "#{indent}#{type.gid.ljust(id_width)} #{type.name.inspect}"
      end
      exit
    rescue SettingsIOError => e
      Pdfh.error_print(e.message, exit_app: false)
      Pdfh.create_settings_file
      exit(1)
    end
    opts.on_tail("-V", "--version", "Show version") do
      puts "#{opts.program_name} v#{Pdfh::VERSION}"
      exit
    end
    opts.on_tail("-h", "--help", "help (this dialog)") do
      puts opts
      exit
    end

    opts.on("-v", "--verbose", "Show more output. Useful for debug")
    opts.on("-d", "--dry", "Dry run, does not write new pdf")
  end
end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pdfh
4
- ##
5
4
  # Handles the Pdf document text extraction and password removal
6
5
  # TODO: Replace command utils with this gem
7
6
  # require 'pdf-reader'
@@ -11,44 +10,46 @@ module Pdfh
11
10
  # @text << page.text
12
11
  # end
13
12
  class PdfHandler
14
- attr_reader :file, :password
13
+ attr_reader :file
15
14
 
15
+ # @return [self]
16
16
  def initialize(file, password)
17
17
  @file = file
18
- @password = password
18
+ @password_option = password ? "--password=#{password.inspect} " : ""
19
19
  end
20
20
 
21
21
  ##
22
22
  # Gets the text from the pdf in order to execute
23
- # the regular expresiom matches
23
+ # the regular expresion matches
24
+ # @return [String]
24
25
  def extract_text
25
26
  temp = `mktemp`.chomp
26
- Verbose.print " --> #{temp} temporal file assigned."
27
+ Pdfh.verbose_print "~~~~~~~~~~~~~~~~~~ Extract PDF text"
28
+ Pdfh.verbose_print " --> #{temp.inspect} temporal file assigned."
27
29
 
28
- password_opt = "--password='#{@password}'" if @password
29
- cmd = %(qpdf #{password_opt} --decrypt --stream-data=uncompress '#{@file}' '#{temp}')
30
- Verbose.print " Command: #{cmd}"
30
+ cmd = %(qpdf #{@password_option}--decrypt --stream-data=uncompress #{@file.inspect} #{temp.inspect})
31
+ Pdfh.verbose_print " DeCrypt Command: #{cmd}"
31
32
  _result = `#{cmd}`
32
33
 
33
- cmd2 = %(pdftotext -enc UTF-8 '#{temp}' -)
34
- Verbose.print " Command: #{cmd2}"
34
+ cmd2 = %(pdftotext -enc UTF-8 #{temp.inspect} -)
35
+ Pdfh.verbose_print " Extract Command: #{cmd2}"
35
36
  text = `#{cmd2}`
36
- Verbose.print " Text extracted: #{text}"
37
+ Pdfh.verbose_print " Text: #{text.inspect}"
37
38
  text
38
39
  end
39
40
 
40
- def write_pdf(dir_path, full_path)
41
- Verbose.print "~~~~~~~~~~~~~~~~~~ Writing PDFs"
41
+ # @return [void]
42
+ def write_new_pdf(dir_path, full_path)
43
+ Pdfh.verbose_print "~~~~~~~~~~~~~~~~~~ Writing PDFs"
42
44
  raise IOError, "Path #{dir_path} not found." unless Dir.exist?(dir_path)
43
45
 
44
- password_opt = "--password='#{@password}'" if @password
45
- cmd = %(qpdf #{password_opt} --decrypt '#{@file}' '#{full_path}')
46
- Verbose.print " Write pdf command: #{cmd}"
46
+ cmd = %(qpdf #{@password_option}--decrypt #{@file.inspect} #{full_path.inspect})
47
+ Pdfh.verbose_print " Write PDF Command: #{cmd}"
47
48
 
48
- return if Dry.active?
49
+ return if Pdfh.dry?
49
50
 
50
51
  _result = `#{cmd}`
51
- raise IOError, "File #{full_path} was not created." unless File.file?(full_path)
52
+ raise IOError, "New PDF file #{full_path.inspect} was not created." unless File.file?(full_path)
52
53
  end
53
54
  end
54
55
  end
data/lib/pdfh/settings.rb CHANGED
@@ -1,61 +1,53 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "yaml"
4
- require "ostruct"
5
- require "base64"
6
4
 
7
5
  module Pdfh
8
- ##
9
6
  # Handles the config yaml data mapping, and associates a file name with a doc type
10
7
  class Settings
11
- attr_reader :scrape_dirs, :base_path, :document_types
8
+ attr_reader :lookup_dirs, :base_path, :document_types
12
9
 
13
- def initialize(file)
14
- file_hash = YAML.load_file(file)
15
- Verbose.print "Loaded configuration file: #{file}"
10
+ # @param config_file [String]
11
+ # @return [self]
12
+ def initialize(config_file)
13
+ file_hash = YAML.load_file(config_file)
14
+ Pdfh.verbose_print "Loaded configuration file: #{config_file}"
16
15
 
17
- @scrape_dirs = process_scrape_dirs(file_hash["scrape_dirs"])
18
- @base_path = File.expand_path(file_hash["base_path"])
19
- @document_types = process_doc_types(file_hash["document_types"])
16
+ process_lookup_dirs(file_hash["lookup_dirs"])
17
+ process_destination_base(file_hash["destination_base_path"])
20
18
 
21
- Verbose.print "Processing directories:"
22
- scrape_dirs.each { |dir| Verbose.print " - #{dir}" }
23
- Verbose.print
24
- end
19
+ Pdfh.verbose_print "Configured Look up directories:"
20
+ lookup_dirs.each_with_index { |dir, idx| Pdfh.verbose_print " #{idx + 1}. #{dir}" }
21
+ Pdfh.verbose_print
25
22
 
26
- ##
27
- # @param [String] file_name
28
- # @return [OpenStruct]
29
- def match_doc_type(file_name)
30
- document_types.each do |type|
31
- match = type.re_file.match(file_name)
32
- return type if match
33
- end
34
- nil
23
+ @document_types = load_doc_types(file_hash["document_types"])
35
24
  end
36
25
 
37
26
  private
38
27
 
39
- def process_scrape_dirs(scrape_dirs_list)
40
- scrape_dirs_list.map do |dir|
28
+ # @return [void]
29
+ def process_lookup_dirs(lookup_dirs_list)
30
+ @lookup_dirs = lookup_dirs_list.filter_map do |dir|
41
31
  expanded = File.expand_path(dir)
42
- dir_exists = File.directory?(expanded)
43
- if dir_exists
44
- expanded
45
- else
46
- Verbose.print " ** Directory #{dir} does not exists."
32
+ unless File.directory?(expanded)
33
+ Pdfh.verbose_print " ** Error, Directory #{dir} does not exists."
34
+ next
47
35
  end
48
- end.compact
36
+ expanded
37
+ end
38
+ raise ArgumentError, "No valid Look up directories configured." if lookup_dirs.empty?
49
39
  end
50
40
 
51
- def process_doc_types(doc_types)
52
- doc_types.map do |x|
53
- object = OpenStruct.new(x)
54
- object.re_file = Regexp.new(object.re_file)
55
- object.re_date = Regexp.new(object.re_date)
56
- object.pwd = object.pwd ? Base64.decode64(object.pwd) : nil
57
- object
58
- end
41
+ # @return [void]
42
+ def process_destination_base(dir)
43
+ @base_path = File.expand_path(dir)
44
+ raise ArgumentError, "Destination base directory is not configured." if @base_path.nil?
45
+ raise ArgumentError, "Destination base directory #{@base_path} does not exist." unless File.directory?(@base_path)
46
+ end
47
+
48
+ # @return [Array<DocumentType>]
49
+ def load_doc_types(doc_types)
50
+ doc_types.map { |data| DocumentType.new(data) }
59
51
  end
60
52
  end
61
53
  end