pdfh 3.3.1 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +0 -15
- data/.gitignore +3 -0
- data/.pre-commit-config.yaml +1 -1
- data/.rubocop.yml +5 -1
- data/.rubocop_todo.yml +5 -18
- data/.simplecov +32 -0
- data/AGENTS.md +174 -0
- data/CHANGELOG.md +74 -9
- data/Gemfile +0 -4
- data/Gemfile.lock +17 -37
- data/README.md +72 -37
- data/Rakefile +24 -6
- data/bin/console +3 -10
- data/bin/run +0 -1
- data/exe/pdfh +1 -1
- data/justfile +65 -0
- data/lib/pdfh/main.rb +25 -120
- data/lib/pdfh/models/document.rb +43 -128
- data/lib/pdfh/models/document_type.rb +35 -69
- data/lib/pdfh/models/run_options.rb +20 -0
- data/lib/pdfh/models/settings.rb +23 -83
- data/lib/pdfh/services/directory_scanner.rb +27 -0
- data/lib/pdfh/services/document_manager.rb +125 -0
- data/lib/pdfh/services/document_matcher.rb +57 -0
- data/lib/pdfh/services/opt_parser.rb +76 -0
- data/lib/pdfh/services/pdf_text_extractor.rb +45 -0
- data/lib/pdfh/services/settings_builder.rb +113 -0
- data/lib/pdfh/services/settings_validator.rb +150 -0
- data/lib/pdfh/utils/console.rb +5 -5
- data/lib/pdfh/utils/date_info.rb +55 -0
- data/lib/pdfh/utils/file_info.rb +47 -0
- data/lib/pdfh/utils/rename_validator.rb +4 -3
- data/lib/pdfh/version.rb +1 -1
- data/lib/pdfh.rb +25 -20
- data/mise.toml +20 -3
- data/pdfh.gemspec +3 -3
- metadata +18 -15
- data/lib/ext/string.rb +0 -9
- data/lib/pdfh/concerns/password_decodable.rb +0 -31
- data/lib/pdfh/models/document_period.rb +0 -37
- data/lib/pdfh/models/document_sub_type.rb +0 -6
- data/lib/pdfh/models/zip_types.rb +0 -17
- data/lib/pdfh/settings_template.rb +0 -21
- data/lib/pdfh/utils/opt_parser.rb +0 -78
- data/lib/pdfh/utils/options.rb +0 -38
- data/lib/pdfh/utils/pdf_file_handler.rb +0 -122
- data/lib/pdfh/utils/settings_builder.rb +0 -62
|
@@ -3,99 +3,65 @@
|
|
|
3
3
|
module Pdfh
|
|
4
4
|
# Represents a type of document that can be processed by pdfh
|
|
5
5
|
class DocumentType
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
REQUIRED_FIELDS = %i[name re_file re_date store_path].freeze
|
|
6
|
+
REQUIRED_KEYS = %i[name re_date store_path].freeze
|
|
7
|
+
DEFAULT_NAME_TEMPLATE = "{name} {period}"
|
|
9
8
|
|
|
10
9
|
# @!attribute [r] name
|
|
11
|
-
# @return [String] The name of the document type
|
|
12
|
-
# @!attribute [r]
|
|
13
|
-
# @return [Regexp] The regular expression to
|
|
10
|
+
# @return [String] The name of the document type
|
|
11
|
+
# @!attribute [r] re_id
|
|
12
|
+
# @return [Regexp] The regular expression to extract the document ID
|
|
14
13
|
# @!attribute [r] re_date
|
|
15
|
-
# @return [Regexp] The regular expression to extract dates
|
|
16
|
-
# @!attribute [r] pwd
|
|
17
|
-
# @return [String, nil] The base64 password for the document type, if any.
|
|
14
|
+
# @return [Regexp] The regular expression to extract dates
|
|
18
15
|
# @!attribute [r] store_path
|
|
19
|
-
# @return [String] The path where the document will be stored
|
|
16
|
+
# @return [String] The path where the document will be stored
|
|
20
17
|
# @!attribute [r] name_template
|
|
21
|
-
# @return [String] The template for generating document names
|
|
22
|
-
# @!attribute [r]
|
|
23
|
-
# @return [
|
|
24
|
-
|
|
18
|
+
# @return [String] The template for generating document names
|
|
19
|
+
# @!attribute [r] path_validator
|
|
20
|
+
# @return [RenameValidator] The validator for the storage path
|
|
21
|
+
# @!attribute [r] name_validator
|
|
22
|
+
# @return [RenameValidator] The validator for the document name
|
|
23
|
+
attr_reader :name, :re_id, :re_date, :store_path, :name_template, :path_validator, :name_validator
|
|
25
24
|
|
|
26
|
-
# @param args [Hash]
|
|
27
|
-
# @return [
|
|
25
|
+
# @param args [Hash] The initialization arguments
|
|
26
|
+
# @return [DocumentType]
|
|
28
27
|
def initialize(args)
|
|
29
28
|
args.each { |k, v| instance_variable_set(:"@#{k}", v) }
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
return if missing_keys?
|
|
30
|
+
|
|
31
|
+
@name = name.to_s.strip
|
|
32
|
+
@re_id = Regexp.new(re_id || name)
|
|
32
33
|
@re_date = Regexp.new(re_date)
|
|
33
|
-
@
|
|
34
|
+
@name_template = name_template || DEFAULT_NAME_TEMPLATE
|
|
34
35
|
@path_validator = RenameValidator.new(store_path)
|
|
35
|
-
@name_validator = RenameValidator.new(name_template)
|
|
36
|
-
|
|
36
|
+
@name_validator = RenameValidator.new(@name_template)
|
|
37
|
+
end
|
|
37
38
|
|
|
38
|
-
|
|
39
|
+
# @return [Boolean]
|
|
40
|
+
def valid?
|
|
41
|
+
missing_keys.empty? &&
|
|
42
|
+
@path_validator.valid? &&
|
|
43
|
+
@name_validator.valid?
|
|
39
44
|
end
|
|
40
45
|
|
|
41
|
-
# @return [Hash{
|
|
46
|
+
# @return [Hash{String => Object}]
|
|
42
47
|
def to_h
|
|
43
48
|
instance_variables.to_h { |var| [var.to_s.delete_prefix("@"), instance_variable_get(var)] }
|
|
44
49
|
end
|
|
45
50
|
|
|
46
51
|
# removes special characters from string and replaces spaces with dashes
|
|
47
|
-
# @example
|
|
48
|
-
# "Test This?%&".gid
|
|
49
|
-
# # => "test-this"
|
|
52
|
+
# @example
|
|
53
|
+
# "Test This?%&".gid # => "test-this"
|
|
50
54
|
# @return [String]
|
|
51
55
|
def gid
|
|
52
56
|
name.downcase.gsub(/[^0-9A-Za-z\s]/, "").tr(" ", "-")
|
|
53
57
|
end
|
|
54
58
|
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# Regexp.new(st.name).match?(name)
|
|
59
|
-
sub_types&.find { |st| /#{st.name}/i.match?(text) }
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# @param values [Hash{Symbol->String}
|
|
63
|
-
# @return [String]
|
|
64
|
-
def generate_new_name(values)
|
|
65
|
-
@name_validator.gsub(values)
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
# @param values [Hash{Symbol->String}
|
|
69
|
-
# @return [String]
|
|
70
|
-
def generate_path(values)
|
|
71
|
-
@path_validator.gsub(values)
|
|
59
|
+
# @return [Array<Symbol>]
|
|
60
|
+
def missing_keys
|
|
61
|
+
@missing_keys ||= REQUIRED_KEYS.select { |key| instance_variable_get(:"@#{key}").to_s.strip.empty? }
|
|
72
62
|
end
|
|
73
63
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
attr_accessor :path_validator, :name_validator
|
|
77
|
-
|
|
78
|
-
# @param sub_types [Array<Hash{Symbol->String}>]
|
|
79
|
-
# @return [Array<DocumentSubType>]
|
|
80
|
-
def extract_subtypes(sub_types)
|
|
81
|
-
sub_types.map do |st|
|
|
82
|
-
data = {
|
|
83
|
-
name: st[:name],
|
|
84
|
-
month_offset: st[:month_offset].to_i,
|
|
85
|
-
re_date: st[:re_date] && Regexp.new(st[:re_date])
|
|
86
|
-
}.compact
|
|
87
|
-
DocumentSubType.new(data)
|
|
88
|
-
end
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
# @raise [ArgumentError] when called
|
|
92
|
-
# @return [void]
|
|
93
|
-
def raise_validators_error
|
|
94
|
-
template = "has invalid %<field>s[Unknown tokens=%<error>s]"
|
|
95
|
-
errors = []
|
|
96
|
-
errors << format(template, field: :store_path, error: path_validator.unknown_list) unless path_validator.valid?
|
|
97
|
-
errors << format(template, field: :name_template, error: name_validator.unknown_list) unless name_validator.valid?
|
|
98
|
-
raise ArgumentError, "Document type #{name.inspect} #{errors.join(", ")}"
|
|
99
|
-
end
|
|
64
|
+
# @return [Boolean]
|
|
65
|
+
def missing_keys? = missing_keys.any?
|
|
100
66
|
end
|
|
101
67
|
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfh
|
|
4
|
+
# Runtime options for the application
|
|
5
|
+
class RunOptions
|
|
6
|
+
# @param verbose [Boolean]
|
|
7
|
+
# @param dry [Boolean]
|
|
8
|
+
# @return [RunOptions]
|
|
9
|
+
def initialize(verbose: false, dry: false)
|
|
10
|
+
@verbose = verbose
|
|
11
|
+
@dry = dry
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# @return [Boolean]
|
|
15
|
+
def verbose? = @verbose
|
|
16
|
+
|
|
17
|
+
# @return [Boolean]
|
|
18
|
+
def dry? = @dry
|
|
19
|
+
end
|
|
20
|
+
end
|
data/lib/pdfh/models/settings.rb
CHANGED
|
@@ -1,101 +1,41 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Pdfh
|
|
4
|
-
# Handles the config yaml data mapping, and associates a file name with a doc type
|
|
4
|
+
# Handles the config yaml data mapping, and associates a file name with a doc type.
|
|
5
|
+
# This is a pure data object — validation is handled by Services::SettingsValidator.
|
|
5
6
|
class Settings
|
|
6
7
|
# @!attribute [r] lookup_dirs
|
|
7
|
-
# @return [Array<String>] List of directories to look up for processing.
|
|
8
|
+
# @return [Array<String>] List of validated, expanded directories to look up for processing.
|
|
8
9
|
# @!attribute [r] base_path
|
|
9
|
-
# @return [String] The base directory path for storing processed files.
|
|
10
|
-
|
|
11
|
-
# @return [Array<ZipType>, nil] List of zip types to process, or nil if none.
|
|
12
|
-
attr_reader :lookup_dirs, :base_path, :zip_types
|
|
10
|
+
# @return [String] The validated, expanded base directory path for storing processed files.
|
|
11
|
+
attr_reader :lookup_dirs, :base_path
|
|
13
12
|
|
|
14
|
-
# @param
|
|
15
|
-
# @
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
Pdfh.debug
|
|
23
|
-
|
|
24
|
-
build_doc_types(config_data[:document_types])
|
|
25
|
-
build_zip_types(config_data[:zip_types]) if config_data.key?(:zip_types)
|
|
13
|
+
# @param lookup_dirs [Array<String>] Already validated and expanded directories
|
|
14
|
+
# @param base_path [String] Already validated and expanded base path
|
|
15
|
+
# @param document_types [Hash{String => DocumentType}] Already validated document types keyed by gid
|
|
16
|
+
# @return [Settings]
|
|
17
|
+
def initialize(lookup_dirs:, base_path:, document_types:)
|
|
18
|
+
@lookup_dirs = lookup_dirs
|
|
19
|
+
@base_path = base_path
|
|
20
|
+
@document_types = document_types
|
|
26
21
|
end
|
|
27
22
|
|
|
28
23
|
# @return [Array<DocumentType>]
|
|
29
|
-
def document_types
|
|
30
|
-
|
|
24
|
+
def document_types = @document_types.values
|
|
25
|
+
|
|
26
|
+
# @example
|
|
27
|
+
# # given settings.document_types.map(&:name) #=> ['12345', '12', '123']
|
|
28
|
+
# settings.document_types_name_max_size #=> 5
|
|
29
|
+
# @return [Integer]
|
|
30
|
+
def document_types_name_max_size
|
|
31
|
+
return 0 if document_types.empty?
|
|
32
|
+
|
|
33
|
+
document_types.map { _1.name.length }.max
|
|
31
34
|
end
|
|
32
35
|
|
|
33
36
|
# @return [DocumentType]
|
|
34
37
|
def document_type(id)
|
|
35
38
|
@document_types[id]
|
|
36
39
|
end
|
|
37
|
-
|
|
38
|
-
# @return [Boolean]
|
|
39
|
-
def zip_types?
|
|
40
|
-
!!zip_types&.any?
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
private
|
|
44
|
-
|
|
45
|
-
# @param lookup_dirs_list [Array[String]]
|
|
46
|
-
# @return [void]
|
|
47
|
-
def process_lookup_dirs(lookup_dirs_list)
|
|
48
|
-
@lookup_dirs = lookup_dirs_list.filter_map do |dir|
|
|
49
|
-
expanded = File.expand_path(dir)
|
|
50
|
-
unless File.directory?(expanded)
|
|
51
|
-
Pdfh.debug " ** Error, Directory #{dir} does not exists."
|
|
52
|
-
next
|
|
53
|
-
end
|
|
54
|
-
expanded
|
|
55
|
-
end
|
|
56
|
-
raise ArgumentError, "No valid Look up directories configured." if lookup_dirs.empty?
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# @return [void]
|
|
60
|
-
# @param dir [String]
|
|
61
|
-
def process_destination_base(dir)
|
|
62
|
-
@base_path = File.expand_path(dir)
|
|
63
|
-
raise ArgumentError, "Destination base directory is not configured." if @base_path.nil?
|
|
64
|
-
raise ArgumentError, "Destination base directory #{@base_path} does not exist." unless File.directory?(@base_path)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# @param doc_types [Array<Hash>]
|
|
68
|
-
# @return [void]
|
|
69
|
-
def build_doc_types(doc_types)
|
|
70
|
-
@document_types = doc_types.each_with_object({}) do |data, result|
|
|
71
|
-
next if missing_required_fields?(data)
|
|
72
|
-
|
|
73
|
-
doc_type = DocumentType.new(data)
|
|
74
|
-
result.store(doc_type.gid, doc_type)
|
|
75
|
-
rescue ArgumentError => e
|
|
76
|
-
Pdfh.error_print e.message, exit_app: false
|
|
77
|
-
Pdfh.backtrace_print e if Pdfh.verbose?
|
|
78
|
-
end
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
def missing_required_fields?(data)
|
|
82
|
-
missing_fields = DocumentType::REQUIRED_FIELDS.select { |field| data[field].nil? || data[field].to_s.empty? }
|
|
83
|
-
if missing_fields.any?
|
|
84
|
-
type_name = data[:name] || "Unnamed type"
|
|
85
|
-
missing_fields_names = missing_fields.join(", ")
|
|
86
|
-
Pdfh.info format("Skipping document type %<type_name>s. Missing required fields: %<missing_fields>s",
|
|
87
|
-
type_name: type_name.colorize(:green),
|
|
88
|
-
missing_fields: missing_fields_names.colorize(:red))
|
|
89
|
-
end
|
|
90
|
-
missing_fields.any?
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# @param zip_types [Array<Hash>]
|
|
94
|
-
# @return [void]
|
|
95
|
-
def build_zip_types(zip_types)
|
|
96
|
-
exit(1) if Pdfh::Utils::DependencyValidator.missing?(:unzip)
|
|
97
|
-
|
|
98
|
-
@zip_types = zip_types.compact.map { ZipType.new(_1) }
|
|
99
|
-
end
|
|
100
40
|
end
|
|
101
41
|
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfh
|
|
4
|
+
module Services
|
|
5
|
+
# Scans the lookup directories and returns the paths of the *.pdf files found in them
|
|
6
|
+
class DirectoryScanner
|
|
7
|
+
# @param directories [Array<String>]
|
|
8
|
+
# @return [DirectoryScanner]
|
|
9
|
+
def initialize(directories)
|
|
10
|
+
@directories = directories
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
# @return [Array<String>]
|
|
14
|
+
def scan
|
|
15
|
+
@directories.flat_map { |dir| scan_dir(dir) }
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
private
|
|
19
|
+
|
|
20
|
+
# @param dir [String]
|
|
21
|
+
# @return [Array<String>]
|
|
22
|
+
def scan_dir(dir)
|
|
23
|
+
Dir.glob(File.join(dir, "*.pdf"))
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfh
|
|
4
|
+
module Services
|
|
5
|
+
# Files a document: copies the PDF and its companion files to the destination and renames the original to *.bkp
|
|
6
|
+
class DocumentManager
|
|
7
|
+
PDF_UNLOCKED_MAGIC_SUFFIX = "_unlocked"
|
|
8
|
+
|
|
9
|
+
# @param document [Document]
|
|
10
|
+
# @param base_path [String]
|
|
11
|
+
# @param dry_run [Boolean]
|
|
12
|
+
# @return [DocumentManager]
|
|
13
|
+
def initialize(document, base_path:, dry_run:)
|
|
14
|
+
@document = document
|
|
15
|
+
@base_path = base_path
|
|
16
|
+
@dry_run = dry_run
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# @return [void]
|
|
20
|
+
def call
|
|
21
|
+
destination_dir = File.join(@base_path, @document.store_path)
|
|
22
|
+
destination_file = File.join(destination_dir, @document.new_name)
|
|
23
|
+
|
|
24
|
+
print_info(destination_dir) if Pdfh.logger.verbose?
|
|
25
|
+
create_destination_dir(destination_dir)
|
|
26
|
+
copy_pdf(destination_file)
|
|
27
|
+
move_companion_files(destination_dir)
|
|
28
|
+
backup_original
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
private
|
|
32
|
+
|
|
33
|
+
# @!attribute [r] document
|
|
34
|
+
# @return [Document]
|
|
35
|
+
attr_reader :document
|
|
36
|
+
|
|
37
|
+
# @return [Boolean]
|
|
38
|
+
def dry_run? = @dry_run
|
|
39
|
+
|
|
40
|
+
# @param dir [String]
|
|
41
|
+
# @return [void]
|
|
42
|
+
def create_destination_dir(dir)
|
|
43
|
+
return if Dir.exist?(dir)
|
|
44
|
+
|
|
45
|
+
Pdfh.logger.debug "Creating directory: #{dir}"
|
|
46
|
+
FileUtils.mkdir_p(dir) unless @dry_run
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# @param destination_file [String]
|
|
50
|
+
# @return [void]
|
|
51
|
+
def copy_pdf(destination_file)
|
|
52
|
+
source_file = @document.file_info.path
|
|
53
|
+
|
|
54
|
+
companion_extensions = companion_files.map { File.extname(_1).delete(".") }
|
|
55
|
+
companion_str = companion_extensions.any? ? " [#{companion_extensions.join(", ").colorize(:magenta)}]" : ""
|
|
56
|
+
message = format("[%<type>s] %<file>s -> %<dest>s#{companion_str}",
|
|
57
|
+
type: document.type.name.ljust(15).colorize(:green),
|
|
58
|
+
file: document.file_info.name.colorize(:blue),
|
|
59
|
+
dest: document.new_name.colorize(:cyan))
|
|
60
|
+
if @dry_run
|
|
61
|
+
Pdfh.logger.info "#{"dry".colorize(:red)} #{message}" unless Pdfh.logger.verbose?
|
|
62
|
+
return
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
Pdfh.logger.info "#{"".colorize(:green)} #{message}" unless Pdfh.logger.verbose?
|
|
66
|
+
FileUtils.cp(source_file, destination_file, preserve: true)
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# @param destination_dir [String]
|
|
70
|
+
# @return [void]
|
|
71
|
+
def move_companion_files(destination_dir)
|
|
72
|
+
companion_files.each do |companion|
|
|
73
|
+
source = companion
|
|
74
|
+
dest_name = File.basename(@document.new_name, @document.file_info.extension) + File.extname(companion)
|
|
75
|
+
destination = File.join(destination_dir, dest_name)
|
|
76
|
+
|
|
77
|
+
FileUtils.cp(source, destination, preserve: true) unless dry_run?
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# @return [void]
|
|
82
|
+
def backup_original
|
|
83
|
+
source_file = document.file_info.path
|
|
84
|
+
backup_file = "#{source_file}.bkp"
|
|
85
|
+
|
|
86
|
+
FileUtils.mv(source_file, backup_file) unless dry_run?
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# Finds companion files by removing the _unlocked suffix from the PDF name if present.
|
|
90
|
+
# This allows PDFs unlocked by qpdf to locate their original companion files (e.g., .xml, .txt)
|
|
91
|
+
# that were never renamed with the _unlocked suffix.
|
|
92
|
+
#
|
|
93
|
+
# @return [Array<String>] array of non-PDF files with the same base name
|
|
94
|
+
# @example
|
|
95
|
+
# # If document is "cuenta_unlocked.pdf", searches for "cuenta.*"
|
|
96
|
+
# # Returns ["cuenta.xml", "cuenta.txt"] (excluding "cuenta.pdf")
|
|
97
|
+
def companion_files
|
|
98
|
+
@companion_files ||= begin
|
|
99
|
+
base_name = document.file_info.stem.delete_suffix(PDF_UNLOCKED_MAGIC_SUFFIX)
|
|
100
|
+
Dir.glob(File.join(document.file_info.dir, "#{base_name}.*")).reject do |file|
|
|
101
|
+
File.extname(file) == ".pdf"
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# @param property [String]
|
|
107
|
+
# @param info [String]
|
|
108
|
+
# @return [void]
|
|
109
|
+
def print_info_line(property, info)
|
|
110
|
+
Pdfh.logger.ident_print property, info.to_s, color: :light_blue, width: 12
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# @param destination_dir [String]
|
|
114
|
+
# @return [void]
|
|
115
|
+
def print_info(destination_dir)
|
|
116
|
+
print_info_line "Type", document.type.name
|
|
117
|
+
print_info_line "Period", document.date_info.period
|
|
118
|
+
print_info_line "New Name", document.new_name
|
|
119
|
+
print_info_line "Store Path", destination_dir
|
|
120
|
+
print_info_line "Extra files", companion_files.any? ? companion_files.join(", ") : "—"
|
|
121
|
+
print_info_line "Processed?", "No (in Dry mode)" if dry_run?
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfh
|
|
4
|
+
module Services
|
|
5
|
+
# Matches a PDF file against settings and builds a Document if valid
|
|
6
|
+
class DocumentMatcher
|
|
7
|
+
# @param document_types [Array<DocumentType>]
|
|
8
|
+
# @return [DocumentMatcher]
|
|
9
|
+
def initialize(document_types)
|
|
10
|
+
@document_types = document_types
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
# @param file [String] Path to the PDF file
|
|
14
|
+
# @param text [String] Extracted text from the PDF
|
|
15
|
+
# @return [Array<Document>]
|
|
16
|
+
def match(file, text)
|
|
17
|
+
@document_types.each_with_object([]) do |type, matches|
|
|
18
|
+
# Try to match the document type by ID (content)
|
|
19
|
+
next unless type.re_id.match?(text)
|
|
20
|
+
|
|
21
|
+
Pdfh.logger.debug "Matched document type: #{type.name}"
|
|
22
|
+
|
|
23
|
+
# Try to match the date in the text
|
|
24
|
+
date_match = type.re_date.match(text)
|
|
25
|
+
unless date_match
|
|
26
|
+
Pdfh.logger.debug "No date match found for #{type.name}"
|
|
27
|
+
next
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Extract date captures (handles both named and positional captures)
|
|
31
|
+
date_captures = extract_date_captures(date_match)
|
|
32
|
+
|
|
33
|
+
matches << Document.new(file, type, text, date_captures)
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
private
|
|
38
|
+
|
|
39
|
+
# Extracts date captures from MatchData, supporting both named and positional captures
|
|
40
|
+
# @param match_data [MatchData]
|
|
41
|
+
# @return [Hash{String => String}] Hash with keys 'm' (month), 'y' (year), 'd' (day)
|
|
42
|
+
def extract_date_captures(match_data)
|
|
43
|
+
if match_data.names.any?
|
|
44
|
+
Pdfh.logger.debug "Using #{"named".colorize(:green)} captures: #{match_data.named_captures.inspect}"
|
|
45
|
+
return match_data.named_captures
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Fall back to positional captures — assume order: [month, year, day?]
|
|
49
|
+
{}.tap do |c|
|
|
50
|
+
c["m"], c["y"], c["d"] = match_data.captures
|
|
51
|
+
c.compact!
|
|
52
|
+
Pdfh.logger.debug "Using #{"positional".colorize(:red)} captures: #{c.inspect}"
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optparse"
|
|
4
|
+
|
|
5
|
+
module Pdfh
|
|
6
|
+
module Services
|
|
7
|
+
# Handles Argument options
|
|
8
|
+
class OptParser
|
|
9
|
+
# @param argv [Array<String>] command line arguments (i.e. ARGV)
|
|
10
|
+
# @param console [Pdfh::Console, nil]
|
|
11
|
+
# @return [self]
|
|
12
|
+
def initialize(argv:, console: nil)
|
|
13
|
+
@argv = argv
|
|
14
|
+
@console = console || Console.new(false)
|
|
15
|
+
@options = {
|
|
16
|
+
verbose: false,
|
|
17
|
+
dry: false
|
|
18
|
+
}
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# @return [Hash{Symbol => Boolean}] Parsed flags (:verbose and :dry)
|
|
22
|
+
def parse_argv
|
|
23
|
+
option_parser = build_option_parser
|
|
24
|
+
option_parser.parse!(@argv)
|
|
25
|
+
@options
|
|
26
|
+
rescue OptionParser::InvalidOption => e
|
|
27
|
+
@console.error_print(e.message, exit_app: false)
|
|
28
|
+
@console.info option_parser.help
|
|
29
|
+
exit 1
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
private
|
|
33
|
+
|
|
34
|
+
# @return [OptionParser] Configured OptionParser instance
|
|
35
|
+
def build_option_parser
|
|
36
|
+
OptionParser.new do |opts|
|
|
37
|
+
opts.banner = "Usage: #{opts.program_name} [options]"
|
|
38
|
+
opts.separator ""
|
|
39
|
+
opts.separator "Specific options:"
|
|
40
|
+
|
|
41
|
+
opts.on("-v", "--verbose", "Show more output. Useful for debug") { @options[:verbose] = true }
|
|
42
|
+
opts.on("-d", "--dry", "Dry run, does not write new pdf") { @options[:dry] = true }
|
|
43
|
+
opts.on_tail("-T", "--list-types", "List document types in configuration") { list_types && exit }
|
|
44
|
+
opts.on_tail("-V", "--version", "Show version") { version || exit }
|
|
45
|
+
opts.on_tail("-h", "--help", "help (this dialog)") { help || exit }
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# @return [nil]
|
|
50
|
+
def version
|
|
51
|
+
@console.info "#{build_option_parser.program_name} v#{Pdfh::VERSION}"
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# @return [nil]
|
|
55
|
+
def help
|
|
56
|
+
@console.info build_option_parser
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Lists the available document types
|
|
60
|
+
# @return [Array] the document types that were listed (the value of the final `each`)
|
|
61
|
+
def list_types
|
|
62
|
+
# Temporarily set logger for loading settings
|
|
63
|
+
Pdfh.logger = @console
|
|
64
|
+
|
|
65
|
+
settings = SettingsBuilder.call
|
|
66
|
+
spacing = " " * 2
|
|
67
|
+
max_width = settings.document_types.map { |t| t.gid.size }.max
|
|
68
|
+
@console.info "#{spacing}#{"ID".ljust(max_width)} Type Name"
|
|
69
|
+
@console.info "#{spacing}#{"—" * max_width} #{"—" * 23}"
|
|
70
|
+
settings.document_types.each do |type|
|
|
71
|
+
@console.info "#{spacing}#{type.gid.ljust(max_width).yellow} #{type.name}"
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "shellwords"
|
|
4
|
+
require "English"
|
|
5
|
+
|
|
6
|
+
module Pdfh
|
|
7
|
+
module Services
|
|
8
|
+
# Extracts text from a PDF using pdftotext command
|
|
9
|
+
class PdfTextExtractor
|
|
10
|
+
# @param pdf_path [String]
|
|
11
|
+
# @return [String]
|
|
12
|
+
# @raise [ArgumentError] if file doesn't exist or is not a PDF
|
|
13
|
+
# @note If pdftotext exits unsuccessfully, a debug message is logged and an empty string is returned
|
|
14
|
+
def self.call(pdf_path)
|
|
15
|
+
validate_file!(pdf_path)
|
|
16
|
+
|
|
17
|
+
# Use Shellwords to properly escape the path for shell execution
|
|
18
|
+
safe_path = Shellwords.escape(pdf_path)
|
|
19
|
+
cmd = "pdftotext -enc UTF-8 -layout #{safe_path} - 2>/dev/null"
|
|
20
|
+
|
|
21
|
+
text = `#{cmd}`
|
|
22
|
+
exit_status = $CHILD_STATUS
|
|
23
|
+
|
|
24
|
+
# Check if command executed successfully
|
|
25
|
+
if exit_status.nil? || !exit_status.success?
|
|
26
|
+
Pdfh.logger.debug "Failed to extract text from: #{pdf_path}"
|
|
27
|
+
return ""
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
text
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# @param pdf_path [String]
|
|
34
|
+
# @return [void]
|
|
35
|
+
# @raise [ArgumentError] if validation fails
|
|
36
|
+
def self.validate_file!(pdf_path)
|
|
37
|
+
raise ArgumentError, "PDF path cannot be nil" if pdf_path.nil?
|
|
38
|
+
raise ArgumentError, "PDF path cannot be empty" if pdf_path.empty?
|
|
39
|
+
raise ArgumentError, "File does not exist: #{pdf_path}" unless File.exist?(pdf_path)
|
|
40
|
+
raise ArgumentError, "Not a file: #{pdf_path}" unless File.file?(pdf_path)
|
|
41
|
+
raise ArgumentError, "Not a PDF file: #{pdf_path}" unless File.extname(pdf_path).casecmp?(".pdf")
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|