pdfh 3.3.1 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +0 -15
- data/.gitignore +3 -0
- data/.pre-commit-config.yaml +1 -1
- data/.rubocop.yml +5 -1
- data/.rubocop_todo.yml +5 -18
- data/.simplecov +32 -0
- data/AGENTS.md +174 -0
- data/CHANGELOG.md +74 -9
- data/Gemfile +0 -4
- data/Gemfile.lock +17 -37
- data/README.md +72 -37
- data/Rakefile +24 -6
- data/bin/console +3 -10
- data/bin/run +0 -1
- data/exe/pdfh +1 -1
- data/justfile +65 -0
- data/lib/pdfh/main.rb +25 -120
- data/lib/pdfh/models/document.rb +43 -128
- data/lib/pdfh/models/document_type.rb +35 -69
- data/lib/pdfh/models/run_options.rb +20 -0
- data/lib/pdfh/models/settings.rb +23 -83
- data/lib/pdfh/services/directory_scanner.rb +27 -0
- data/lib/pdfh/services/document_manager.rb +125 -0
- data/lib/pdfh/services/document_matcher.rb +57 -0
- data/lib/pdfh/services/opt_parser.rb +76 -0
- data/lib/pdfh/services/pdf_text_extractor.rb +45 -0
- data/lib/pdfh/services/settings_builder.rb +113 -0
- data/lib/pdfh/services/settings_validator.rb +150 -0
- data/lib/pdfh/utils/console.rb +5 -5
- data/lib/pdfh/utils/date_info.rb +55 -0
- data/lib/pdfh/utils/file_info.rb +47 -0
- data/lib/pdfh/utils/rename_validator.rb +4 -3
- data/lib/pdfh/version.rb +1 -1
- data/lib/pdfh.rb +25 -20
- data/mise.toml +20 -3
- data/pdfh.gemspec +3 -3
- metadata +18 -15
- data/lib/ext/string.rb +0 -9
- data/lib/pdfh/concerns/password_decodable.rb +0 -31
- data/lib/pdfh/models/document_period.rb +0 -37
- data/lib/pdfh/models/document_sub_type.rb +0 -6
- data/lib/pdfh/models/zip_types.rb +0 -17
- data/lib/pdfh/settings_template.rb +0 -21
- data/lib/pdfh/utils/opt_parser.rb +0 -78
- data/lib/pdfh/utils/options.rb +0 -38
- data/lib/pdfh/utils/pdf_file_handler.rb +0 -122
- data/lib/pdfh/utils/settings_builder.rb +0 -62
data/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
[![Conventional Commits][cc-img]][cc-url]
|
|
6
6
|
[![Current version][gem-img]][gem-url]
|
|
7
7
|
|
|
8
|
-
Examine all PDF files in lookup directories,
|
|
8
|
+
Examine all PDF files in lookup directories, identify them using regular expressions, rename them, and copy them to organized directories.
|
|
9
9
|
|
|
10
10
|
## Installation
|
|
11
11
|
|
|
@@ -15,34 +15,52 @@ gem install pdfh
|
|
|
15
15
|
|
|
16
16
|
### Dependencies
|
|
17
17
|
|
|
18
|
-
You need to install
|
|
18
|
+
You need to install `pdftotext` to extract text from PDF files.
|
|
19
19
|
|
|
20
20
|
#### macOS
|
|
21
21
|
|
|
22
22
|
```bash
|
|
23
|
-
brew install
|
|
23
|
+
brew install xpdf
|
|
24
24
|
```
|
|
25
25
|
|
|
26
26
|
#### Fedora
|
|
27
27
|
|
|
28
28
|
```bash
|
|
29
|
-
sudo dnf install -y
|
|
29
|
+
sudo dnf install -y poppler-utils
|
|
30
30
|
```
|
|
31
31
|
|
|
32
32
|
#### Arch
|
|
33
33
|
|
|
34
34
|
```bash
|
|
35
|
-
sudo pacman -S
|
|
35
|
+
sudo pacman -S poppler
|
|
36
36
|
```
|
|
37
37
|
|
|
38
38
|
## Usage
|
|
39
39
|
|
|
40
40
|
After installing this gem, create your configuration file in one of the following directories:
|
|
41
|
+
|
|
41
42
|
- `~/.config/pdfh.yml`
|
|
42
43
|
- `~/pdfh.yml`
|
|
43
44
|
- or configure the `PDFH_CONFIG_FILE` environment variable
|
|
44
45
|
|
|
46
|
+
Then run:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pdfh
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The tool will:
|
|
53
|
+
|
|
54
|
+
1. Scan all PDFs in the configured `lookup_dirs`
|
|
55
|
+
2. Extract text from each PDF using `pdftotext`
|
|
56
|
+
3. Match the extracted text from each PDF against your configured `document_types` (via `re_id`)
|
|
57
|
+
4. Copy matched documents to organized directories within `destination_base_path`
|
|
58
|
+
5. Rename files according to your `name_template`
|
|
59
|
+
|
|
60
|
+
### Configuration
|
|
61
|
+
|
|
45
62
|
Example configuration:
|
|
63
|
+
|
|
46
64
|
```yaml
|
|
47
65
|
---
|
|
48
66
|
lookup_dirs: # Directories where all PDFs will be analyzed
|
|
@@ -50,45 +68,42 @@ lookup_dirs: # Directories where all PDFs will be analyzed
|
|
|
50
68
|
destination_base_path: ~/PDFs # Directory where all matching documents will be copied (MUST exist)
|
|
51
69
|
document_types:
|
|
52
70
|
- name: My Bank # Description (type)
|
|
53
|
-
|
|
54
|
-
re_date: '\d{1,2} de (\w+) de (\d+)' # Date
|
|
55
|
-
pwd: base64_encoded # [OPTIONAL] Password if the document is protected
|
|
71
|
+
re_id: 'Account ID: 12334-\w{3}' # [OPTIONAL (uses name as fallback)] RegEx to match from PDF content as document identifier
|
|
72
|
+
re_date: '\d{1,2} de (\w+) de (\d+)' # Date RegEx (to extract from PDF content)
|
|
56
73
|
store_path: "{year}/bank_docs" # Relative path to copy this document
|
|
57
|
-
name_template: '{period} {
|
|
58
|
-
sub_types: # [OPTIONAL] In case your need an extra category
|
|
59
|
-
- name: AccountX # Regular expression to match this subtype
|
|
60
|
-
re_date: '\d{1,2} de (\w+)' # [OPTIONAL] Date regular expression
|
|
61
|
-
month_offset: -1 # [OPTIONAL] Integer (signed) value to adjust month
|
|
62
|
-
zip_types: # [OPTIONAL] Zip files to be processed BEFORE the PDFs
|
|
63
|
-
- name: My Bank 2 # Description
|
|
64
|
-
re_file: 'Document_MR5664_\d+_\d+.zip' # Regular expression to match its filename
|
|
65
|
-
pwd: base64_encoded # [OPTIONAL] Password if the document is protected
|
|
74
|
+
name_template: '{period} {name}' # [OPTIONAL] Template for new filename when copied
|
|
66
75
|
```
|
|
67
76
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
77
|
+
### Placeholders
|
|
78
|
+
|
|
79
|
+
**Store Path** and **Name Template** support the following placeholders:
|
|
71
80
|
|
|
72
|
-
|
|
81
|
+
| Placeholder | Description | Example |
|
|
82
|
+
| --- | --- | --- |
|
|
83
|
+
| `{original}` | Original filename | `MyBankDocument2.pdf` |
|
|
84
|
+
| `{period}` | Year-Month | `2022-07` |
|
|
85
|
+
| `{year}` | Year | `2022` |
|
|
86
|
+
| `{month}` | Month | `07` |
|
|
87
|
+
| `{day}` | Day (if captured) | `01` |
|
|
88
|
+
| `{quarter}` | Quarter (Q1-Q4) | `Q3` |
|
|
89
|
+
| `{bimester}` | Bimester (B1-B6) | `B4` |
|
|
90
|
+
| `{name}` | Document type **name** | `My Bank` |
|
|
73
91
|
|
|
74
|
-
|
|
75
|
-
--- |---------------------------| ---
|
|
76
|
-
`{original}` | Original filename | MyBankDocument2.pdf
|
|
77
|
-
`{period}` | Year-Month | 2022-01
|
|
78
|
-
`{year}` | Year | 2022
|
|
79
|
-
`{month}` | Month | 01
|
|
80
|
-
`{type}` | Document type **name** | My Bank
|
|
81
|
-
`{subtype}` | Sub type **name** | AccountX
|
|
82
|
-
`{extra}` | day if captured/matched | 01
|
|
92
|
+
The `period`, `year`, `month`, `day`, `quarter` and `bimester` placeholders are calculated from the date captured by the `re_date` regular expression.
|
|
83
93
|
|
|
84
|
-
|
|
94
|
+
### Date Extraction Examples
|
|
85
95
|
|
|
86
|
-
|
|
96
|
+
The `re_date` regex extracts date information from the PDF content:
|
|
87
97
|
|
|
88
|
-
Date text | RegEx | Captured
|
|
89
|
-
--- | --- | ---
|
|
90
|
-
`01/02/2025` | `(?<d>\d{2}\/(?<m>\d{2})\/(?<y>\d{4})` | d: `01` m: `02` y: `2025`
|
|
91
|
-
`072025
|
|
98
|
+
| Date text | RegEx | Captured |
|
|
99
|
+
| --- | --- | --- |
|
|
100
|
+
| `01/02/2025` | `(?<d>\d{2})\/(?<m>\d{2})\/(?<y>\d{4})` | d: `01` m: `02` y: `2025` |
|
|
101
|
+
| `072025 -` | `(?<m>\d{2})(?<y>\d{4}) -` | m: `07` y: `2025` |
|
|
102
|
+
| `31 de julio de 2025` | `\d{1,2} de (\w+) de (\d+)` | month: `julio` year: `2025` |
|
|
103
|
+
|
|
104
|
+
Named captures supported: `y` for year, `m` for month, `d` for day.
|
|
105
|
+
|
|
106
|
+
If named captures are not used, the regex groups will be matched in order: `month`, `year`.
|
|
92
107
|
|
|
93
108
|
## Development
|
|
94
109
|
|
|
@@ -132,10 +147,30 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
|
132
147
|
|
|
133
148
|
Everyone interacting in the Pdfh project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/iax7/pdfh/blob/master/CODE_OF_CONDUCT.md).
|
|
134
149
|
|
|
150
|
+
## Command Options
|
|
151
|
+
|
|
152
|
+
Run with verbose output:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pdfh -v
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Run in dry-run mode (no files will be moved):
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
pdfh --dry
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Show version:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
pdfh --version
|
|
168
|
+
```
|
|
169
|
+
|
|
135
170
|
<!-- Links -->
|
|
136
171
|
[rubocop-img]: https://github.com/iax7/pdfh/actions/workflows/rubocop-analysis.yml/badge.svg
|
|
137
172
|
[rubocop-url]: https://github.com/iax7/pdfh/actions/workflows/rubocop-analysis.yml
|
|
138
|
-
[ruby-img]: https://img.shields.io/badge/ruby-
|
|
173
|
+
[ruby-img]: https://img.shields.io/badge/ruby-4.0-blue?style=flat&logo=ruby&logoColor=CC342D&labelColor=white
|
|
139
174
|
[ruby-url]: https://www.ruby-lang.org/en/
|
|
140
175
|
[cc-img]: https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=00&labelColor=fff
|
|
141
176
|
[cc-url]: https://conventionalcommits.org
|
data/Rakefile
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
require "colorize"
|
|
4
4
|
require "bundler/gem_tasks"
|
|
5
5
|
require "rspec/core/rake_task"
|
|
6
|
-
require "versionomy"
|
|
7
6
|
|
|
8
7
|
RSpec::Core::RakeTask.new(:spec)
|
|
9
8
|
|
|
@@ -16,13 +15,32 @@ task :bump, :type do |_t, args|
|
|
|
16
15
|
version_file = File.join(__dir__, "lib", "pdfh", "version.rb")
|
|
17
16
|
content = File.read(version_file)
|
|
18
17
|
|
|
19
|
-
version_pattern = /(?<major>\d+)\.(?<minor>\d+)\.(?<tiny>\d+)/
|
|
20
|
-
|
|
21
|
-
next_version = Versionomy.parse(current_version.to_s).bump(args.type).to_s
|
|
18
|
+
version_pattern = /VERSION = "(?<major>\d+)\.(?<minor>\d+)\.(?<tiny>\d+)"/
|
|
19
|
+
match = content.match(version_pattern)
|
|
22
20
|
|
|
23
|
-
|
|
21
|
+
major = match[:major].to_i
|
|
22
|
+
minor = match[:minor].to_i
|
|
23
|
+
tiny = match[:tiny].to_i
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
case args.type.to_sym
|
|
26
|
+
when :major
|
|
27
|
+
major += 1
|
|
28
|
+
minor = 0
|
|
29
|
+
tiny = 0
|
|
30
|
+
when :minor
|
|
31
|
+
minor += 1
|
|
32
|
+
tiny = 0
|
|
33
|
+
when :tiny
|
|
34
|
+
tiny += 1
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
current_version = "#{match[:major]}.#{match[:minor]}.#{match[:tiny]}"
|
|
38
|
+
next_version = "#{major}.#{minor}.#{tiny}"
|
|
39
|
+
|
|
40
|
+
new_content = content.gsub(version_pattern, "VERSION = \"#{next_version}\"")
|
|
41
|
+
File.write(version_file, new_content)
|
|
42
|
+
|
|
43
|
+
puts "Successfully bumped from #{current_version.red} to #{next_version.green}"
|
|
26
44
|
puts "\n> Building v#{next_version.green}..."
|
|
27
45
|
puts `rake build`
|
|
28
46
|
end
|
data/bin/console
CHANGED
|
@@ -5,14 +5,7 @@ require "bundler/setup"
|
|
|
5
5
|
require "pdfh"
|
|
6
6
|
|
|
7
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
|
8
|
-
# with your gem easier.
|
|
8
|
+
# with your gem easier.
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
p Pdfh::OptParser.parse_argv
|
|
14
|
-
|
|
15
|
-
Pry.start
|
|
16
|
-
|
|
17
|
-
# require "irb"
|
|
18
|
-
# IRB.start(__FILE__)
|
|
10
|
+
require "irb"
|
|
11
|
+
IRB.start(__FILE__)
|
data/bin/run
CHANGED
data/exe/pdfh
CHANGED
data/justfile
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Run commands through bundle if present
|
|
2
|
+
|
|
3
|
+
set shell := ["bash", "-c"]
|
|
4
|
+
|
|
5
|
+
# List all available tasks
|
|
6
|
+
default:
|
|
7
|
+
@just --list
|
|
8
|
+
|
|
9
|
+
# --- Installation and setup ---
|
|
10
|
+
|
|
11
|
+
# Install gems and system dependencies via mise
|
|
12
|
+
[group('setup')]
|
|
13
|
+
setup:
|
|
14
|
+
mise install
|
|
15
|
+
bundle install
|
|
16
|
+
|
|
17
|
+
# Update gems
|
|
18
|
+
[group('setup')]
|
|
19
|
+
update:
|
|
20
|
+
bundle update --bundler
|
|
21
|
+
bundle update --all
|
|
22
|
+
|
|
23
|
+
# --- Testing and quality ---
|
|
24
|
+
|
|
25
|
+
# Run all checks (linting and tests)
|
|
26
|
+
[group('test')]
|
|
27
|
+
check: lint test
|
|
28
|
+
|
|
29
|
+
# Run all tests with RSpec
|
|
30
|
+
[group('test')]
|
|
31
|
+
test:
|
|
32
|
+
bundle exec rspec
|
|
33
|
+
|
|
34
|
+
# Run a specific test (e.g., just test-file spec/models/user_spec.rb:42)
|
|
35
|
+
[group('test')]
|
|
36
|
+
test-file path:
|
|
37
|
+
bundle exec rspec {{ path }}
|
|
38
|
+
|
|
39
|
+
# Run the linter (RuboCop) and auto-fix simple issues
|
|
40
|
+
[group('test')]
|
|
41
|
+
lint:
|
|
42
|
+
bundle exec rubocop -a
|
|
43
|
+
|
|
44
|
+
# Open coverage HTML report
|
|
45
|
+
[group('test')]
|
|
46
|
+
coverage:
|
|
47
|
+
@[[ -f coverage/index.html ]] && open coverage/index.html || echo "Coverage report not found"
|
|
48
|
+
|
|
49
|
+
# --- Version management and release ---
|
|
50
|
+
|
|
51
|
+
# Bump version (major|minor|tiny)
|
|
52
|
+
[group('release')]
|
|
53
|
+
bump type='tiny':
|
|
54
|
+
bundle exec rake "bump[{{ type }}]"
|
|
55
|
+
bundle install
|
|
56
|
+
|
|
57
|
+
# Build and install the gem locally
|
|
58
|
+
[group('release')]
|
|
59
|
+
install:
|
|
60
|
+
bundle exec rake install
|
|
61
|
+
|
|
62
|
+
# Create a git tag, build and push gem to RubyGems
|
|
63
|
+
[group('release')]
|
|
64
|
+
release:
|
|
65
|
+
bundle exec rake release
|
data/lib/pdfh/main.rb
CHANGED
|
@@ -7,137 +7,42 @@ module Pdfh
|
|
|
7
7
|
# @param argv [Array<String>]
|
|
8
8
|
# @return [void]
|
|
9
9
|
def start(argv:)
|
|
10
|
-
arg_options =
|
|
11
|
-
|
|
12
|
-
assign_global_utils(@options)
|
|
13
|
-
Pdfh.print_options(arg_options)
|
|
10
|
+
arg_options = Services::OptParser.new(argv: argv).parse_argv
|
|
11
|
+
options = RunOptions.new(**arg_options)
|
|
14
12
|
|
|
15
|
-
|
|
16
|
-
Pdfh.
|
|
13
|
+
# Initialize the global logger
|
|
14
|
+
Pdfh.logger = Console.new(options.verbose?)
|
|
15
|
+
Pdfh.logger.print_options(arg_options)
|
|
17
16
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
Pdfh.error_print(e.message, exit_app: false)
|
|
21
|
-
Pdfh.create_settings_file
|
|
22
|
-
exit(1)
|
|
23
|
-
rescue StandardError => e
|
|
24
|
-
Pdfh.backtrace_print e if Pdfh.verbose?
|
|
25
|
-
Pdfh.error_print(e.message)
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
private
|
|
29
|
-
|
|
30
|
-
attr_reader :options, :settings
|
|
31
|
-
|
|
32
|
-
# @param options [Options]
|
|
33
|
-
# @return [void]
|
|
34
|
-
def assign_global_utils(options)
|
|
35
|
-
Pdfh.instance_variable_set(:@options, options)
|
|
36
|
-
Pdfh.instance_variable_set(:@console, Console.new(options.verbose?))
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
# @param [String] file_name
|
|
40
|
-
# @return [DocumentType, nil]
|
|
41
|
-
def match_doc_type(file_name)
|
|
42
|
-
settings.document_types.each do |type|
|
|
43
|
-
match = type.re_file.match(file_name)
|
|
44
|
-
return type if match
|
|
45
|
-
end
|
|
46
|
-
nil
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
# @return [void]
|
|
50
|
-
def process_provided_files
|
|
51
|
-
type_id = options.type
|
|
52
|
-
raise ArgumentError, "No files provided to process #{type_id.inspect} type." unless options.files?
|
|
53
|
-
|
|
54
|
-
type = settings.document_type(type_id)
|
|
55
|
-
Pdfh.error_print "Type #{type_id.inspect} was not found." if type.nil?
|
|
56
|
-
options.files.each do |file|
|
|
57
|
-
next Pdfh.warn_print "File #{file.inspect} does not exist." unless File.exist?(file)
|
|
58
|
-
next Pdfh.warn_print "File #{file.inspect} is not a pdf." unless File.extname(file) == ".pdf"
|
|
59
|
-
|
|
60
|
-
PdfFileHandler.new(file, type).process_document(settings.base_path)
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
# @return [void]
|
|
65
|
-
def process_lookup_dirs
|
|
66
|
-
settings.lookup_dirs.each do |work_directory|
|
|
67
|
-
process_directory(work_directory)
|
|
68
|
-
end
|
|
69
|
-
end
|
|
17
|
+
settings = Services::SettingsBuilder.call
|
|
18
|
+
Pdfh.logger.debug "Destination path: #{settings.base_path.colorize(:light_blue)}"
|
|
70
19
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def process_zip_files(work_directory)
|
|
74
|
-
@settings.zip_types&.each do |zip_type|
|
|
75
|
-
find_files(work_directory, :zip).each do |file|
|
|
76
|
-
next unless zip_type.re_file.match?(File.basename(file))
|
|
20
|
+
files = Services::DirectoryScanner.new(settings.lookup_dirs).scan
|
|
21
|
+
matcher = Services::DocumentMatcher.new(settings.document_types)
|
|
77
22
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
end
|
|
82
|
-
end
|
|
83
|
-
end
|
|
23
|
+
files.each do |file_path|
|
|
24
|
+
Pdfh.logger.info "Working on: #{file_path.colorize(:green)}" if Pdfh.logger.verbose?
|
|
25
|
+
text = Services::PdfTextExtractor.call(file_path)
|
|
84
26
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
# @return [Array<String>]
|
|
88
|
-
def find_files(directory, type)
|
|
89
|
-
glob = File.join(directory, "*.#{type}")
|
|
90
|
-
Dir.glob(glob)
|
|
91
|
-
end
|
|
27
|
+
documents = matcher.match(file_path, text)
|
|
28
|
+
next Pdfh.logger.debug "No document type match found for #{file_path.colorize(:yellow)}" if documents.empty?
|
|
92
29
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
processed_result = RunResult.new
|
|
97
|
-
files = find_files(work_directory, :pdf)
|
|
98
|
-
files.each do |pdf_file|
|
|
99
|
-
type = match_doc_type(pdf_file)
|
|
100
|
-
if type
|
|
101
|
-
PdfFileHandler.new(pdf_file, type).process_document(settings.base_path)
|
|
102
|
-
processed_result.add_processed(pdf_file)
|
|
103
|
-
else
|
|
104
|
-
processed_result.add_ignored(pdf_file)
|
|
30
|
+
unless documents.one?
|
|
31
|
+
matches = documents.map { _1.type.name.inspect }.join(", ")
|
|
32
|
+
next Pdfh.logger.warn_print "Skipping #{file_path.inspect} as multiple matches found: #{matches}."
|
|
105
33
|
end
|
|
106
|
-
end
|
|
107
|
-
print_processing_results(processed_result)
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
# @return [String]
|
|
111
|
-
def base_name_no_ext(file)
|
|
112
|
-
File.basename(file, File.extname(file))
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
def print_processing_results(result)
|
|
116
|
-
Pdfh.info " (No files processed)".colorize(:light_black) if result.processed.empty?
|
|
117
|
-
return unless Pdfh.verbose?
|
|
118
34
|
|
|
119
|
-
|
|
120
|
-
result.ignored.each.with_index(1) do |file, index|
|
|
121
|
-
Pdfh.ident_print index, base_name_no_ext(file), color: :magenta
|
|
35
|
+
Services::DocumentManager.new(documents.first, base_path: settings.base_path, dry_run: options.dry?).call
|
|
122
36
|
end
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
# keeps track of the processed and ignored files
|
|
127
|
-
class RunResult
|
|
128
|
-
attr_reader :processed, :ignored
|
|
129
37
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
38
|
+
nil
|
|
39
|
+
rescue SettingsIOError => e
|
|
40
|
+
Pdfh.logger.error_print(e.message, exit_app: false)
|
|
41
|
+
exit(1)
|
|
42
|
+
rescue StandardError => e
|
|
43
|
+
Pdfh.logger.backtrace_print(e) if Pdfh.logger.verbose?
|
|
44
|
+
Pdfh.logger.error_print(e.message)
|
|
134
45
|
end
|
|
135
|
-
|
|
136
|
-
# @return [void]
|
|
137
|
-
def add_ignored(file) = @ignored << file
|
|
138
|
-
|
|
139
|
-
# @return [void]
|
|
140
|
-
def add_processed(file) = @processed << file
|
|
141
46
|
end
|
|
142
47
|
end
|
|
143
48
|
end
|
data/lib/pdfh/models/document.rb
CHANGED
|
@@ -1,152 +1,67 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Pdfh
|
|
4
|
-
#
|
|
4
|
+
# Lightweight struct that connects a PDF file with its matched document type and
|
|
5
|
+
# extracted text. All file metadata, date interpretation, and rename resolution
|
|
6
|
+
# are accessible through dedicated value objects (FileInfo, DateInfo).
|
|
5
7
|
class Document
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
|
|
13
|
-
|
|
8
|
+
# @!attribute [r] file_info
|
|
9
|
+
# @return [FileInfo] File metadata wrapper
|
|
10
|
+
# @!attribute [r] type
|
|
11
|
+
# @return [DocumentType] Matched document type
|
|
12
|
+
# @!attribute [r] text
|
|
13
|
+
# @return [String] Extracted text from the PDF
|
|
14
|
+
# @!attribute [r] date_info
|
|
15
|
+
# @return [DateInfo] Parsed date value object
|
|
16
|
+
attr_reader :file_info, :type, :text, :date_info
|
|
17
|
+
|
|
18
|
+
# @param file [String] Path to the PDF file
|
|
19
|
+
# @param type [DocumentType] Type of the document
|
|
20
|
+
# @param text [String] Extracted text from the PDF
|
|
21
|
+
# @param date_captures [Hash{String => String}] Captured date components from regex
|
|
22
|
+
# @return [self] A new Document instance
|
|
23
|
+
def initialize(file, type, text, date_captures)
|
|
14
24
|
@type = type
|
|
15
25
|
@text = text
|
|
26
|
+
@file_info = FileInfo.new(file)
|
|
27
|
+
@date_info = DateInfo.new(date_captures)
|
|
16
28
|
end
|
|
17
29
|
|
|
18
|
-
# @return [
|
|
19
|
-
def process
|
|
20
|
-
Pdfh.debug "=== Document Type: #{type.name} =============================="
|
|
21
|
-
Pdfh.debug "~~~~~~~~~~~~~~~~~~ Finding a subtype"
|
|
22
|
-
@sub_type = type.sub_type(@text)
|
|
23
|
-
Pdfh.debug " SubType: #{@sub_type}"
|
|
24
|
-
@companion = search_companion_files
|
|
25
|
-
|
|
26
|
-
month, year, @extra = match_date(@sub_type&.re_date || @type.re_date)
|
|
27
|
-
@period = DocumentPeriod.new(day: extra, month: month, month_offset: @sub_type&.month_offset, year: year)
|
|
28
|
-
Pdfh.debug " Period: #{@period.inspect}"
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
# @return [void]
|
|
32
|
-
def print_info
|
|
33
|
-
print_info_line "Type", type.name
|
|
34
|
-
print_info_line "Sub-Type", sub_type
|
|
35
|
-
print_info_line "Period", period
|
|
36
|
-
print_info_line "New Name", new_name
|
|
37
|
-
print_info_line "Store Path", store_path
|
|
38
|
-
print_info_line "Extra files", companion_files(join: true)
|
|
39
|
-
print_info_line "Processed?", "No (in Dry mode)" if Pdfh.dry?
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# @return [void]
|
|
43
|
-
def print_info_line(property, info)
|
|
44
|
-
Pdfh.ident_print property, info.to_s, color: :light_blue, width: 12
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# @return [String]
|
|
48
|
-
def file_name_only
|
|
49
|
-
File.basename(@file, file_extension)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# @return [String]
|
|
53
|
-
def file_extension
|
|
54
|
-
File.extname(@file)
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
# @return [String]
|
|
58
|
-
def file_name
|
|
59
|
-
File.basename(@file)
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# @return [String]
|
|
63
|
-
def backup_name
|
|
64
|
-
"#{file_name}.bkp"
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# @return [String]
|
|
30
|
+
# @return [String] Document type name or "N/A" if type is nil
|
|
68
31
|
def type_name
|
|
69
|
-
type&.name
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
# @return [String]
|
|
73
|
-
def sub_type
|
|
74
|
-
@sub_type&.name&.titleize || "N/A"
|
|
32
|
+
type&.name || "N/A"
|
|
75
33
|
end
|
|
76
34
|
|
|
77
|
-
# @return [
|
|
78
|
-
def
|
|
79
|
-
|
|
80
|
-
original: file_name_only,
|
|
81
|
-
period: period.to_s,
|
|
82
|
-
year: period.year.to_s,
|
|
83
|
-
month: period.month.to_s,
|
|
84
|
-
type: type_name,
|
|
85
|
-
subtype: sub_type,
|
|
86
|
-
extra: extra || ""
|
|
87
|
-
}.freeze
|
|
35
|
+
# @return [String] File name
|
|
36
|
+
def to_s
|
|
37
|
+
file_info.name
|
|
88
38
|
end
|
|
89
39
|
|
|
90
|
-
# @return [String]
|
|
40
|
+
# @return [String] New file name with extension (e.g., "2024-01 Cuenta.pdf")
|
|
91
41
|
def new_name
|
|
92
|
-
|
|
93
|
-
"#{new_name}#{file_extension}"
|
|
42
|
+
"#{@type.name_validator.gsub(rename_data)}#{@file_info.extension}"
|
|
94
43
|
end
|
|
95
44
|
|
|
96
|
-
# @return [String]
|
|
45
|
+
# @return [String] Storage path for the document (e.g., "2024/Edo Cuenta")
|
|
97
46
|
def store_path
|
|
98
|
-
type.
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
# @return [String (frozen)]
|
|
102
|
-
def companion_files(join: false)
|
|
103
|
-
return @companion unless join
|
|
104
|
-
|
|
105
|
-
@companion.empty? ? "N/A" : @companion.join(", ")
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
# @return [String]
|
|
109
|
-
def home_dir
|
|
110
|
-
File.dirname(@file)
|
|
111
|
-
end
|
|
112
|
-
|
|
113
|
-
# @return [String]
|
|
114
|
-
def to_s
|
|
115
|
-
@file
|
|
47
|
+
@type.path_validator.gsub(rename_data)
|
|
116
48
|
end
|
|
117
49
|
|
|
118
50
|
private
|
|
119
51
|
|
|
120
|
-
#
|
|
121
|
-
#
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
return matched.captures.map(&:downcase) if regex.named_captures.empty?
|
|
134
|
-
|
|
135
|
-
extra = matched.captures.size > 2 ? matched[:d] : nil
|
|
136
|
-
[matched[:m].downcase, matched[:y], extra]
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
# @return [Array]
|
|
140
|
-
def search_companion_files
|
|
141
|
-
Pdfh.debug "~~~~~~~~~~~~~~~~~~ Searching Companion files"
|
|
142
|
-
Pdfh.debug " Searching on: #{home_dir.inspect}"
|
|
143
|
-
Dir.chdir(home_dir) do
|
|
144
|
-
files_matching = Dir["#{file_name_only}.*"]
|
|
145
|
-
companion = files_matching.reject { |file| file.include? ".pdf" }
|
|
146
|
-
Pdfh.debug " Found: #{companion.inspect}"
|
|
147
|
-
|
|
148
|
-
companion
|
|
149
|
-
end
|
|
52
|
+
# Used to replace variables in the rename pattern i.e {original}, {period}, etc.
|
|
53
|
+
# @return [Hash{Symbol => String}] Hash containing rename variables
|
|
54
|
+
def rename_data
|
|
55
|
+
@rename_data ||= {
|
|
56
|
+
original: @file_info.stem,
|
|
57
|
+
period: @date_info.period,
|
|
58
|
+
year: @date_info.year.to_s,
|
|
59
|
+
month: @date_info.month.to_s,
|
|
60
|
+
quarter: "Q#{@date_info.quarter}",
|
|
61
|
+
bimester: "B#{@date_info.bimester}",
|
|
62
|
+
name: @type.name,
|
|
63
|
+
day: @date_info.day || ""
|
|
64
|
+
}.freeze
|
|
150
65
|
end
|
|
151
66
|
end
|
|
152
67
|
end
|