pdfh 3.3.1 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +0 -15
  3. data/.gitignore +3 -0
  4. data/.pre-commit-config.yaml +1 -1
  5. data/.rubocop.yml +5 -1
  6. data/.rubocop_todo.yml +5 -18
  7. data/.simplecov +32 -0
  8. data/AGENTS.md +174 -0
  9. data/CHANGELOG.md +74 -9
  10. data/Gemfile +0 -4
  11. data/Gemfile.lock +17 -37
  12. data/README.md +72 -37
  13. data/Rakefile +24 -6
  14. data/bin/console +3 -10
  15. data/bin/run +0 -1
  16. data/exe/pdfh +1 -1
  17. data/justfile +65 -0
  18. data/lib/pdfh/main.rb +25 -120
  19. data/lib/pdfh/models/document.rb +43 -128
  20. data/lib/pdfh/models/document_type.rb +35 -69
  21. data/lib/pdfh/models/run_options.rb +20 -0
  22. data/lib/pdfh/models/settings.rb +23 -83
  23. data/lib/pdfh/services/directory_scanner.rb +27 -0
  24. data/lib/pdfh/services/document_manager.rb +125 -0
  25. data/lib/pdfh/services/document_matcher.rb +57 -0
  26. data/lib/pdfh/services/opt_parser.rb +76 -0
  27. data/lib/pdfh/services/pdf_text_extractor.rb +45 -0
  28. data/lib/pdfh/services/settings_builder.rb +113 -0
  29. data/lib/pdfh/services/settings_validator.rb +150 -0
  30. data/lib/pdfh/utils/console.rb +5 -5
  31. data/lib/pdfh/utils/date_info.rb +55 -0
  32. data/lib/pdfh/utils/file_info.rb +47 -0
  33. data/lib/pdfh/utils/rename_validator.rb +4 -3
  34. data/lib/pdfh/version.rb +1 -1
  35. data/lib/pdfh.rb +25 -20
  36. data/mise.toml +20 -3
  37. data/pdfh.gemspec +3 -3
  38. metadata +18 -15
  39. data/lib/ext/string.rb +0 -9
  40. data/lib/pdfh/concerns/password_decodable.rb +0 -31
  41. data/lib/pdfh/models/document_period.rb +0 -37
  42. data/lib/pdfh/models/document_sub_type.rb +0 -6
  43. data/lib/pdfh/models/zip_types.rb +0 -17
  44. data/lib/pdfh/settings_template.rb +0 -21
  45. data/lib/pdfh/utils/opt_parser.rb +0 -78
  46. data/lib/pdfh/utils/options.rb +0 -38
  47. data/lib/pdfh/utils/pdf_file_handler.rb +0 -122
  48. data/lib/pdfh/utils/settings_builder.rb +0 -62
@@ -3,99 +3,65 @@
3
3
  module Pdfh
4
4
  # Represents a type of document that can be processed by pdfh
5
5
  class DocumentType
6
- include Concerns::PasswordDecodable
7
-
8
- REQUIRED_FIELDS = %i[name re_file re_date store_path].freeze
6
+ REQUIRED_KEYS = %i[name re_date store_path].freeze
7
+ DEFAULT_NAME_TEMPLATE = "{name} {period}"
9
8
 
10
9
  # @!attribute [r] name
11
- # @return [String] The name of the document type.
12
- # @!attribute [r] re_file
13
- # @return [Regexp] The regular expression to match file names.
10
+ # @return [String] The name of the document type
11
+ # @!attribute [r] re_id
12
+ # @return [Regexp] The regular expression to extract the document ID
14
13
  # @!attribute [r] re_date
15
- # @return [Regexp] The regular expression to extract dates and its information.
16
- # @!attribute [r] pwd
17
- # @return [String, nil] The base64 password for the document type, if any.
14
+ # @return [Regexp] The regular expression to extract dates
18
15
  # @!attribute [r] store_path
19
- # @return [String] The path where the document will be stored.
16
+ # @return [String] The path where the document will be stored
20
17
  # @!attribute [r] name_template
21
- # @return [String] The template for generating document names.
22
- # @!attribute [r] sub_types
23
- # @return [Array<DocumentSubType>, nil] The subtypes of the document, if any.
24
- attr_reader :name, :re_file, :re_date, :pwd, :store_path, :name_template, :sub_types
18
+ # @return [String] The template for generating document names
19
+ # @!attribute [r] path_validator
20
+ # @return [RenameValidator] The validator for the storage path
21
+ # @!attribute [r] name_validator
22
+ # @return [RenameValidator] The validator for the document name
23
+ attr_reader :name, :re_id, :re_date, :store_path, :name_template, :path_validator, :name_validator
25
24
 
26
- # @param args [Hash]
27
- # @return [self]
25
+ # @param args [Hash] The initialization arguments
26
+ # @return [DocumentType]
28
27
  def initialize(args)
29
28
  args.each { |k, v| instance_variable_set(:"@#{k}", v) }
30
- @name_template ||= "{original}"
31
- @re_file = Regexp.new(re_file)
29
+ return if missing_keys?
30
+
31
+ @name = name.to_s.strip
32
+ @re_id = Regexp.new(re_id || name)
32
33
  @re_date = Regexp.new(re_date)
33
- @sub_types = extract_subtypes(sub_types) if sub_types&.any?
34
+ @name_template = name_template || DEFAULT_NAME_TEMPLATE
34
35
  @path_validator = RenameValidator.new(store_path)
35
- @name_validator = RenameValidator.new(name_template)
36
- return if @path_validator.valid? && @name_validator.valid?
36
+ @name_validator = RenameValidator.new(@name_template)
37
+ end
37
38
 
38
- raise_validators_error
39
+ # @return [Boolean]
40
+ def valid?
41
+ missing_keys.empty? &&
42
+ @path_validator.valid? &&
43
+ @name_validator.valid?
39
44
  end
40
45
 
41
- # @return [Hash{Symbol->any}]
46
+ # @return [Hash{String => Object}]
42
47
  def to_h
43
48
  instance_variables.to_h { |var| [var.to_s.delete_prefix("@"), instance_variable_get(var)] }
44
49
  end
45
50
 
46
51
  # removes special characters from string and replaces spaces with dashes
47
- # @example usage
48
- # "Test This?%&".gid
49
- # # => "test-this"
52
+ # @example
53
+ # "Test This?%&".gid # => "test-this"
50
54
  # @return [String]
51
55
  def gid
52
56
  name.downcase.gsub(/[^0-9A-Za-z\s]/, "").tr(" ", "-")
53
57
  end
54
58
 
55
- # search the subtype name in the pdf document
56
- # @return [DocumentSubType]
57
- def sub_type(text)
58
- # Regexp.new(st.name).match?(name)
59
- sub_types&.find { |st| /#{st.name}/i.match?(text) }
60
- end
61
-
62
- # @param values [Hash{Symbol->String}
63
- # @return [String]
64
- def generate_new_name(values)
65
- @name_validator.gsub(values)
66
- end
67
-
68
- # @param values [Hash{Symbol->String}
69
- # @return [String]
70
- def generate_path(values)
71
- @path_validator.gsub(values)
59
+ # @return [Array<Symbol>]
60
+ def missing_keys
61
+ @missing_keys ||= REQUIRED_KEYS.select { |key| instance_variable_get(:"@#{key}").to_s.strip.empty? }
72
62
  end
73
63
 
74
- private
75
-
76
- attr_accessor :path_validator, :name_validator
77
-
78
- # @param sub_types [Array<Hash{Symbol->String}>]
79
- # @return [Array<DocumentSubType>]
80
- def extract_subtypes(sub_types)
81
- sub_types.map do |st|
82
- data = {
83
- name: st[:name],
84
- month_offset: st[:month_offset].to_i,
85
- re_date: st[:re_date] && Regexp.new(st[:re_date])
86
- }.compact
87
- DocumentSubType.new(data)
88
- end
89
- end
90
-
91
- # @raise [ArgumentError] when called
92
- # @return [void]
93
- def raise_validators_error
94
- template = "has invalid %<field>s[Unknown tokens=%<error>s]"
95
- errors = []
96
- errors << format(template, field: :store_path, error: path_validator.unknown_list) unless path_validator.valid?
97
- errors << format(template, field: :name_template, error: name_validator.unknown_list) unless name_validator.valid?
98
- raise ArgumentError, "Document type #{name.inspect} #{errors.join(", ")}"
99
- end
64
+ # @return [Boolean]
65
+ def missing_keys? = missing_keys.any?
100
66
  end
101
67
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfh
4
+ # Runtime options for the application
5
+ class RunOptions
6
+ # @param verbose [Boolean]
7
+ # @param dry [Boolean]
8
+ # @return [RunOptions]
9
+ def initialize(verbose: false, dry: false)
10
+ @verbose = verbose
11
+ @dry = dry
12
+ end
13
+
14
+ # @return [Boolean]
15
+ def verbose? = @verbose
16
+
17
+ # @return [Boolean]
18
+ def dry? = @dry
19
+ end
20
+ end
@@ -1,101 +1,41 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pdfh
4
- # Handles the config yaml data mapping, and associates a file name with a doc type
4
+ # Handles the config yaml data mapping, and associates a file name with a doc type.
5
+ # This is a pure data object — validation is handled by Services::SettingsValidator.
5
6
  class Settings
6
7
  # @!attribute [r] lookup_dirs
7
- # @return [Array<String>] List of directories to look up for processing.
8
+ # @return [Array<String>] List of validated, expanded directories to look up for processing.
8
9
  # @!attribute [r] base_path
9
- # @return [String] The base directory path for storing processed files.
10
- # @!attribute [r] zip_types
11
- # @return [Array<ZipType>, nil] List of zip types to process, or nil if none.
12
- attr_reader :lookup_dirs, :base_path, :zip_types
10
+ # @return [String] The validated, expanded base directory path for storing processed files.
11
+ attr_reader :lookup_dirs, :base_path
13
12
 
14
- # @param config_data [Hash]
15
- # @return [self]
16
- def initialize(config_data)
17
- process_lookup_dirs(config_data[:lookup_dirs])
18
- process_destination_base(config_data[:destination_base_path])
19
-
20
- Pdfh.debug "Configured Look up directories:"
21
- lookup_dirs.each.with_index(1) { |dir, idx| Pdfh.debug " #{idx}. #{dir}" }
22
- Pdfh.debug
23
-
24
- build_doc_types(config_data[:document_types])
25
- build_zip_types(config_data[:zip_types]) if config_data.key?(:zip_types)
13
+ # @param lookup_dirs [Array<String>] Already validated and expanded directories
14
+ # @param base_path [String] Already validated and expanded base path
15
+ # @param document_types [Hash{String => DocumentType}] Already validated document types keyed by gid
16
+ # @return [Settings]
17
+ def initialize(lookup_dirs:, base_path:, document_types:)
18
+ @lookup_dirs = lookup_dirs
19
+ @base_path = base_path
20
+ @document_types = document_types
26
21
  end
27
22
 
28
23
  # @return [Array<DocumentType>]
29
- def document_types
30
- @document_types.values
24
+ def document_types = @document_types.values
25
+
26
+ # @example
27
+ # # document_types.map(&:name) #=> ['12345', '12', '123']
28
+ # settings.document_types_name_max_size #=> 5
29
+ # @return [Integer]
30
+ def document_types_name_max_size
31
+ return 0 if document_types.empty?
32
+
33
+ document_types.map { _1.name.length }.max
31
34
  end
32
35
 
33
36
  # @return [DocumentType]
34
37
  def document_type(id)
35
38
  @document_types[id]
36
39
  end
37
-
38
- # @return [Boolean]
39
- def zip_types?
40
- !!zip_types&.any?
41
- end
42
-
43
- private
44
-
45
- # @param lookup_dirs_list [Array[String]]
46
- # @return [void]
47
- def process_lookup_dirs(lookup_dirs_list)
48
- @lookup_dirs = lookup_dirs_list.filter_map do |dir|
49
- expanded = File.expand_path(dir)
50
- unless File.directory?(expanded)
51
- Pdfh.debug " ** Error, Directory #{dir} does not exists."
52
- next
53
- end
54
- expanded
55
- end
56
- raise ArgumentError, "No valid Look up directories configured." if lookup_dirs.empty?
57
- end
58
-
59
- # @return [void]
60
- # @param dir [String]
61
- def process_destination_base(dir)
62
- @base_path = File.expand_path(dir)
63
- raise ArgumentError, "Destination base directory is not configured." if @base_path.nil?
64
- raise ArgumentError, "Destination base directory #{@base_path} does not exist." unless File.directory?(@base_path)
65
- end
66
-
67
- # @param doc_types [Array<Hash>]
68
- # @return [void]
69
- def build_doc_types(doc_types)
70
- @document_types = doc_types.each_with_object({}) do |data, result|
71
- next if missing_required_fields?(data)
72
-
73
- doc_type = DocumentType.new(data)
74
- result.store(doc_type.gid, doc_type)
75
- rescue ArgumentError => e
76
- Pdfh.error_print e.message, exit_app: false
77
- Pdfh.backtrace_print e if Pdfh.verbose?
78
- end
79
- end
80
-
81
- def missing_required_fields?(data)
82
- missing_fields = DocumentType::REQUIRED_FIELDS.select { |field| data[field].nil? || data[field].to_s.empty? }
83
- if missing_fields.any?
84
- type_name = data[:name] || "Unnamed type"
85
- missing_fields_names = missing_fields.join(", ")
86
- Pdfh.info format("Skipping document type %<type_name>s. Missing required fields: %<missing_fields>s",
87
- type_name: type_name.colorize(:green),
88
- missing_fields: missing_fields_names.colorize(:red))
89
- end
90
- missing_fields.any?
91
- end
92
-
93
- # @param zip_types [Array<Hash>]
94
- # @return [void]
95
- def build_zip_types(zip_types)
96
- exit(1) if Pdfh::Utils::DependencyValidator.missing?(:unzip)
97
-
98
- @zip_types = zip_types.compact.map { ZipType.new(_1) }
99
- end
100
40
  end
101
41
  end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfh
4
+ module Services
5
+ # Scans lookup dirs and returns the paths of the PDF files found in them
6
+ class DirectoryScanner
7
+ # @param directories [Array<String>]
8
+ # @return [DirectoryScanner]
9
+ def initialize(directories)
10
+ @directories = directories
11
+ end
12
+
13
+ # @return [Array<String>]
14
+ def scan
15
+ @directories.flat_map { |dir| scan_dir(dir) }
16
+ end
17
+
18
+ private
19
+
20
+ # @param dir [String]
21
+ # @return [Array<String>]
22
+ def scan_dir(dir)
23
+ Dir.glob(File.join(dir, "*.pdf"))
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfh
4
+ module Services
5
+ # Manages the documents, rename, move, etc.
6
+ class DocumentManager
7
+ PDF_UNLOCKED_MAGIC_SUFFIX = "_unlocked"
8
+
9
+ # @param document [Document]
10
+ # @param base_path [String]
11
+ # @param dry_run [Boolean]
12
+ # @return [DocumentManager]
13
+ def initialize(document, base_path:, dry_run:)
14
+ @document = document
15
+ @base_path = base_path
16
+ @dry_run = dry_run
17
+ end
18
+
19
+ # @return [void]
20
+ def call
21
+ destination_dir = File.join(@base_path, @document.store_path)
22
+ destination_file = File.join(destination_dir, @document.new_name)
23
+
24
+ print_info(destination_dir) if Pdfh.logger.verbose?
25
+ create_destination_dir(destination_dir)
26
+ copy_pdf(destination_file)
27
+ move_companion_files(destination_dir)
28
+ backup_original
29
+ end
30
+
31
+ private
32
+
33
+ # @!attribute [r] document
34
+ # @return [Document]
35
+ attr_reader :document
36
+
37
+ # @return [Boolean]
38
+ def dry_run? = @dry_run
39
+
40
+ # @param dir [String]
41
+ # @return [void]
42
+ def create_destination_dir(dir)
43
+ return if Dir.exist?(dir)
44
+
45
+ Pdfh.logger.debug "Creating directory: #{dir}"
46
+ FileUtils.mkdir_p(dir) unless @dry_run
47
+ end
48
+
49
+ # @param destination_file [String]
50
+ # @return [void]
51
+ def copy_pdf(destination_file)
52
+ source_file = @document.file_info.path
53
+
54
+ companion_extensions = companion_files.map { File.extname(_1).delete(".") }
55
+ companion_str = companion_extensions.any? ? " [#{companion_extensions.join(", ").colorize(:magenta)}]" : ""
56
+ message = format("[%<type>s] %<file>s -> %<dest>s#{companion_str}",
57
+ type: document.type.name.ljust(15).colorize(:green),
58
+ file: document.file_info.name.colorize(:blue),
59
+ dest: document.new_name.colorize(:cyan))
60
+ if @dry_run
61
+ Pdfh.logger.info "#{"dry".colorize(:red)} #{message}" unless Pdfh.logger.verbose?
62
+ return
63
+ end
64
+
65
+ Pdfh.logger.info "#{"".colorize(:green)} #{message}" unless Pdfh.logger.verbose?
66
+ FileUtils.cp(source_file, destination_file, preserve: true)
67
+ end
68
+
69
+ # @param destination_dir [String]
70
+ # @return [void]
71
+ def move_companion_files(destination_dir)
72
+ companion_files.each do |companion|
73
+ source = companion
74
+ dest_name = File.basename(@document.new_name, @document.file_info.extension) + File.extname(companion)
75
+ destination = File.join(destination_dir, dest_name)
76
+
77
+ FileUtils.cp(source, destination, preserve: true) unless dry_run?
78
+ end
79
+ end
80
+
81
+ # @return [void]
82
+ def backup_original
83
+ source_file = document.file_info.path
84
+ backup_file = "#{source_file}.bkp"
85
+
86
+ FileUtils.mv(source_file, backup_file) unless dry_run?
87
+ end
88
+
89
+ # Finds companion files by removing the _unlocked suffix from the PDF name if present.
90
+ # This allows PDFs unlocked by qpdf to locate their original companion files (e.g., .xml, .txt)
91
+ # that were never renamed with the _unlocked suffix.
92
+ #
93
+ # @return [Array<String>] array of non-PDF files with the same base name
94
+ # @example
95
+ # # If document is "cuenta_unlocked.pdf", searches for "cuenta.*"
96
+ # # Returns ["cuenta.xml", "cuenta.txt"] (excluding "cuenta.pdf")
97
+ def companion_files
98
+ @companion_files ||= begin
99
+ base_name = document.file_info.stem.delete_suffix(PDF_UNLOCKED_MAGIC_SUFFIX)
100
+ Dir.glob(File.join(document.file_info.dir, "#{base_name}.*")).reject do |file|
101
+ File.extname(file) == ".pdf"
102
+ end
103
+ end
104
+ end
105
+
106
+ # @param property [String]
107
+ # @param info [String]
108
+ # @return [void]
109
+ def print_info_line(property, info)
110
+ Pdfh.logger.ident_print property, info.to_s, color: :light_blue, width: 12
111
+ end
112
+
113
+ # @param destination_dir [String]
114
+ # @return [void]
115
+ def print_info(destination_dir)
116
+ print_info_line "Type", document.type.name
117
+ print_info_line "Period", document.date_info.period
118
+ print_info_line "New Name", document.new_name
119
+ print_info_line "Store Path", destination_dir
120
+ print_info_line "Extra files", companion_files.any? ? companion_files.join(", ") : "—"
121
+ print_info_line "Processed?", "No (in Dry mode)" if dry_run?
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfh
4
+ module Services
5
+ # Matches a PDF file against settings and builds a Document if valid
6
+ class DocumentMatcher
7
+ # @param document_types [Array<DocumentType>]
8
+ # @return [DocumentMatcher]
9
+ def initialize(document_types)
10
+ @document_types = document_types
11
+ end
12
+
13
+ # @param file [String] Path to the PDF file
14
+ # @param text [String] Extracted text from the PDF
15
+ # @return [Array<Document>]
16
+ def match(file, text)
17
+ @document_types.each_with_object([]) do |type, matches|
18
+ # Try to match the document type by ID (content)
19
+ next unless type.re_id.match?(text)
20
+
21
+ Pdfh.logger.debug "Matched document type: #{type.name}"
22
+
23
+ # Try to match the date in the text
24
+ date_match = type.re_date.match(text)
25
+ unless date_match
26
+ Pdfh.logger.debug "No date match found for #{type.name}"
27
+ next
28
+ end
29
+
30
+ # Extract date captures (handles both named and positional captures)
31
+ date_captures = extract_date_captures(date_match)
32
+
33
+ matches << Document.new(file, type, text, date_captures)
34
+ end
35
+ end
36
+
37
+ private
38
+
39
+ # Extracts date captures from MatchData, supporting both named and positional captures
40
+ # @param match_data [MatchData]
41
+ # @return [Hash{String => String}] Hash with keys 'm' (month), 'y' (year), 'd' (day)
42
+ def extract_date_captures(match_data)
43
+ if match_data.names.any?
44
+ Pdfh.logger.debug "Using #{"named".colorize(:green)} captures: #{match_data.named_captures.inspect}"
45
+ return match_data.named_captures
46
+ end
47
+
48
+ # Fall back to positional captures — assume order: [month, year, day?]
49
+ {}.tap do |c|
50
+ c["m"], c["y"], c["d"] = match_data.captures
51
+ c.compact!
52
+ Pdfh.logger.debug "Using #{"positional".colorize(:red)} captures: #{c.inspect}"
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+
5
+ module Pdfh
6
+ module Services
7
+ # Handles Argument options
8
+ class OptParser
9
+ # @param argv [Array<String>] command line arguments (i.e. ARGV)
10
+ # @param console [Pdfh::Console, nil]
11
+ # @return [self]
12
+ def initialize(argv:, console: nil)
13
+ @argv = argv
14
+ @console = console || Console.new(false)
15
+ @options = {
16
+ verbose: false,
17
+ dry: false
18
+ }
19
+ end
20
+
21
+ # @return [Hash] Parsed options including flags and file arguments
22
+ def parse_argv
23
+ option_parser = build_option_parser
24
+ option_parser.parse!(@argv)
25
+ @options
26
+ rescue OptionParser::InvalidOption => e
27
+ @console.error_print(e.message, exit_app: false)
28
+ @console.info option_parser.help
29
+ exit 1
30
+ end
31
+
32
+ private
33
+
34
+ # @return [OptionParser] Configured OptionParser instance
35
+ def build_option_parser
36
+ OptionParser.new do |opts|
37
+ opts.banner = "Usage: #{opts.program_name} [options]"
38
+ opts.separator ""
39
+ opts.separator "Specific options:"
40
+
41
+ opts.on("-v", "--verbose", "Show more output. Useful for debug") { @options[:verbose] = true }
42
+ opts.on("-d", "--dry", "Dry run, does not write new pdf") { @options[:dry] = true }
43
+ opts.on_tail("-T", "--list-types", "List document types in configuration") { list_types && exit }
44
+ opts.on_tail("-V", "--version", "Show version") { version || exit }
45
+ opts.on_tail("-h", "--help", "help (this dialog)") { help || exit }
46
+ end
47
+ end
48
+
49
+ # @return [nil]
50
+ def version
51
+ @console.info "#{build_option_parser.program_name} v#{Pdfh::VERSION}"
52
+ end
53
+
54
+ # @return [nil]
55
+ def help
56
+ @console.info build_option_parser
57
+ end
58
+
59
+ # Lists the available document types
60
+ # @return [nil]
61
+ def list_types
62
+ # Temporarily set logger for loading settings
63
+ Pdfh.logger = @console
64
+
65
+ settings = SettingsBuilder.call
66
+ spacing = " " * 2
67
+ max_width = settings.document_types.map { |t| t.gid.size }.max
68
+ @console.info "#{spacing}#{"ID".ljust(max_width)} Type Name"
69
+ @console.info "#{spacing}#{"—" * max_width} #{"—" * 23}"
70
+ settings.document_types.each do |type|
71
+ @console.info "#{spacing}#{type.gid.ljust(max_width).yellow} #{type.name}"
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "shellwords"
4
+ require "English"
5
+
6
+ module Pdfh
7
+ module Services
8
+ # Extracts text from a PDF using pdftotext command
9
+ class PdfTextExtractor
10
+ # @param pdf_path [String]
11
+ # @return [String]
12
+ # @raise [ArgumentError] if file doesn't exist or is not a PDF
13
+ # @raise [RuntimeError] if extraction fails
14
+ def self.call(pdf_path)
15
+ validate_file!(pdf_path)
16
+
17
+ # Use Shellwords to properly escape the path for shell execution
18
+ safe_path = Shellwords.escape(pdf_path)
19
+ cmd = "pdftotext -enc UTF-8 -layout #{safe_path} - 2>/dev/null"
20
+
21
+ text = `#{cmd}`
22
+ exit_status = $CHILD_STATUS
23
+
24
+ # Check if command executed successfully
25
+ if exit_status.nil? || !exit_status.success?
26
+ Pdfh.logger.debug "Failed to extract text from: #{pdf_path}"
27
+ return ""
28
+ end
29
+
30
+ text
31
+ end
32
+
33
+ # @param pdf_path [String]
34
+ # @return [void]
35
+ # @raise [ArgumentError] if validation fails
36
+ def self.validate_file!(pdf_path)
37
+ raise ArgumentError, "PDF path cannot be nil" if pdf_path.nil?
38
+ raise ArgumentError, "PDF path cannot be empty" if pdf_path.empty?
39
+ raise ArgumentError, "File does not exist: #{pdf_path}" unless File.exist?(pdf_path)
40
+ raise ArgumentError, "Not a file: #{pdf_path}" unless File.file?(pdf_path)
41
+ raise ArgumentError, "Not a PDF file: #{pdf_path}" unless File.extname(pdf_path).casecmp?(".pdf")
42
+ end
43
+ end
44
+ end
45
+ end