pdfh 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/.pre-commit-config.yaml +36 -0
- data/.rubocop.yml +4 -3
- data/.rubocop_todo.yml +15 -36
- data/Gemfile.lock +31 -28
- data/README.md +46 -18
- data/bin/run +3 -1
- data/exe/pdfh +2 -16
- data/lib/pdfh/concerns/password_decodable.rb +31 -0
- data/lib/pdfh/main.rb +65 -16
- data/lib/pdfh/models/document_type.rb +16 -12
- data/lib/pdfh/models/settings.rb +27 -4
- data/lib/pdfh/models/zip_types.rb +17 -0
- data/lib/pdfh/utils/dependency_validator.rb +35 -0
- data/lib/pdfh/utils/opt_parser.rb +61 -46
- data/lib/pdfh/utils/pdf_file_handler.rb +1 -1
- data/lib/pdfh/version.rb +1 -1
- data/lib/pdfh.rb +9 -1
- data/mise.toml +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29f90e2cbc737c2ea6f15b58f6a46513d90ac68053c770018a5a2549b76adcc6
|
4
|
+
data.tar.gz: 476440f76e4cd8eb0cdc248d9040cbf75fc510656ff1129d83d18bc1f9edf8ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10672062c7041f020920bf4c51a9af88d82130d4db59f8a332a5b05df055bf6079c127b4c8d7d31fb145f9fc88b9527bc330a83f23d2f0dc69c92e843c8246ba
|
7
|
+
data.tar.gz: 606afc6611bb8036cd073c8a5a031dbc0812d2385de77d1b774c2c49b42673f266e4020c679a558dc3d709b3309e1e9d8708d2a8a7b6808f66b4d1ae770da780
|
data/.gitignore
CHANGED
@@ -0,0 +1,36 @@
|
|
1
|
+
# See https://pre-commit.com for more information
|
2
|
+
# See https://pre-commit.com/hooks.html for more hooks
|
3
|
+
default_install_hook_types:
|
4
|
+
- pre-commit
|
5
|
+
- commit-msg
|
6
|
+
|
7
|
+
repos:
|
8
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
9
|
+
rev: v5.0.0
|
10
|
+
hooks:
|
11
|
+
- id: trailing-whitespace
|
12
|
+
- id: end-of-file-fixer
|
13
|
+
exclude: ^.idea/
|
14
|
+
- id: check-yaml
|
15
|
+
- id: check-added-large-files
|
16
|
+
- id: check-executables-have-shebangs
|
17
|
+
- id: check-shebang-scripts-are-executable
|
18
|
+
- id: mixed-line-ending
|
19
|
+
- repo: https://github.com/gitleaks/gitleaks
|
20
|
+
rev: v8.25.0
|
21
|
+
hooks:
|
22
|
+
- id: gitleaks
|
23
|
+
- repo: https://github.com/rubocop/rubocop
|
24
|
+
rev: v1.75.4
|
25
|
+
hooks:
|
26
|
+
- id: rubocop
|
27
|
+
- repo: https://github.com/compilerla/conventional-pre-commit
|
28
|
+
rev: v4.1.0
|
29
|
+
hooks:
|
30
|
+
- id: conventional-pre-commit
|
31
|
+
stages: [commit-msg]
|
32
|
+
args: []
|
33
|
+
- repo: https://github.com/codespell-project/codespell
|
34
|
+
rev: v2.4.1
|
35
|
+
hooks:
|
36
|
+
- id: codespell
|
data/.rubocop.yml
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
inherit_from: .rubocop_todo.yml
|
2
2
|
|
3
|
-
|
4
|
-
- rubocop-factory_bot
|
3
|
+
plugins:
|
5
4
|
- rubocop-performance
|
5
|
+
- rubocop-factory_bot
|
6
6
|
- rubocop-rake
|
7
|
-
- rubocop-rspec
|
8
7
|
|
9
8
|
AllCops:
|
10
9
|
NewCops: enable
|
@@ -14,6 +13,8 @@ AllCops:
|
|
14
13
|
- pkg/**/*
|
15
14
|
- tmp/**/*
|
16
15
|
- vendor/**/*
|
16
|
+
SuggestExtensions:
|
17
|
+
rubocop-rspec: false
|
17
18
|
|
18
19
|
Layout/LineLength:
|
19
20
|
Max: 120
|
data/.rubocop_todo.yml
CHANGED
@@ -1,54 +1,33 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on
|
3
|
+
# on 2025-04-19 20:15:39 UTC using RuboCop version 1.75.2.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 10
|
10
10
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
|
11
11
|
Metrics/AbcSize:
|
12
|
-
Max:
|
12
|
+
Max: 38
|
13
|
+
|
14
|
+
# Offense count: 16
|
15
|
+
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
16
|
+
# AllowedMethods: refine
|
17
|
+
Metrics/BlockLength:
|
18
|
+
Max: 131
|
13
19
|
|
14
20
|
# Offense count: 6
|
15
21
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
16
22
|
Metrics/MethodLength:
|
17
|
-
Max:
|
18
|
-
|
19
|
-
# Offense count: 2
|
20
|
-
RSpec/AnyInstance:
|
21
|
-
Exclude:
|
22
|
-
- 'spec/pdfh/main_spec.rb'
|
23
|
-
|
24
|
-
# Offense count: 8
|
25
|
-
# Configuration parameters: Include, CustomTransform, IgnoreMethods, SpecSuffixOnly.
|
26
|
-
# Include: **/*_spec*rb*, **/spec/**/*
|
27
|
-
RSpec/FilePath:
|
28
|
-
Exclude:
|
29
|
-
- 'spec/pdfh/models/document_period_spec.rb'
|
30
|
-
- 'spec/pdfh/models/document_spec.rb'
|
31
|
-
- 'spec/pdfh/utils/console_spec.rb'
|
32
|
-
- 'spec/pdfh/utils/month_spec.rb'
|
33
|
-
- 'spec/pdfh/utils/opt_parser_spec.rb'
|
34
|
-
- 'spec/pdfh/utils/pdf_file_handler_spec.rb'
|
35
|
-
- 'spec/pdfh/utils/rename_validator_spec.rb'
|
36
|
-
- 'spec/pdfh/utils/settings_builder_spec.rb'
|
23
|
+
Max: 33
|
37
24
|
|
38
|
-
# Offense count:
|
39
|
-
# Configuration parameters:
|
40
|
-
#
|
41
|
-
|
25
|
+
# Offense count: 1
|
26
|
+
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
|
27
|
+
# AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
|
28
|
+
Naming/MethodParameterName:
|
42
29
|
Exclude:
|
43
|
-
- '
|
44
|
-
- 'spec/pdfh/models/document_period_spec.rb'
|
45
|
-
- 'spec/pdfh/models/document_spec.rb'
|
46
|
-
- 'spec/pdfh/utils/console_spec.rb'
|
47
|
-
- 'spec/pdfh/utils/month_spec.rb'
|
48
|
-
- 'spec/pdfh/utils/opt_parser_spec.rb'
|
49
|
-
- 'spec/pdfh/utils/pdf_file_handler_spec.rb'
|
50
|
-
- 'spec/pdfh/utils/rename_validator_spec.rb'
|
51
|
-
- 'spec/pdfh/utils/settings_builder_spec.rb'
|
30
|
+
- 'lib/pdfh/utils/console.rb'
|
52
31
|
|
53
32
|
# Offense count: 3
|
54
33
|
# This cop supports safe autocorrection (--autocorrect).
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
pdfh (3.
|
4
|
+
pdfh (3.3.0)
|
5
5
|
colorize (~> 1.1.0)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
-
activesupport (8.0.
|
10
|
+
activesupport (8.0.2)
|
11
11
|
base64
|
12
12
|
benchmark (>= 0.3)
|
13
13
|
bigdecimal
|
@@ -21,7 +21,7 @@ GEM
|
|
21
21
|
tzinfo (~> 2.0, >= 2.0.5)
|
22
22
|
uri (>= 0.13.1)
|
23
23
|
ansi (1.5.0)
|
24
|
-
ast (2.4.
|
24
|
+
ast (2.4.3)
|
25
25
|
base64 (0.2.0)
|
26
26
|
benchmark (0.4.0)
|
27
27
|
bigdecimal (3.1.9)
|
@@ -31,12 +31,12 @@ GEM
|
|
31
31
|
coderay (1.1.3)
|
32
32
|
colorize (1.1.0)
|
33
33
|
concurrent-ruby (1.3.5)
|
34
|
-
connection_pool (2.5.
|
34
|
+
connection_pool (2.5.3)
|
35
35
|
date (3.4.1)
|
36
36
|
debug (1.10.0)
|
37
37
|
irb (~> 1.10)
|
38
38
|
reline (>= 0.3.8)
|
39
|
-
diff-lcs (1.6.
|
39
|
+
diff-lcs (1.6.1)
|
40
40
|
docile (1.4.1)
|
41
41
|
drb (2.2.1)
|
42
42
|
factory_bot (6.5.1)
|
@@ -44,36 +44,37 @@ GEM
|
|
44
44
|
i18n (1.14.7)
|
45
45
|
concurrent-ruby (~> 1.0)
|
46
46
|
io-console (0.8.0)
|
47
|
-
irb (1.15.
|
47
|
+
irb (1.15.2)
|
48
48
|
pp (>= 0.6.0)
|
49
49
|
rdoc (>= 4.0.0)
|
50
50
|
reline (>= 0.4.2)
|
51
|
-
json (2.
|
51
|
+
json (2.11.3)
|
52
52
|
language_server-protocol (3.17.0.4)
|
53
53
|
lint_roller (1.1.0)
|
54
|
-
logger (1.
|
54
|
+
logger (1.7.0)
|
55
55
|
method_source (1.1.0)
|
56
|
-
minitest (5.25.
|
57
|
-
parallel (1.
|
58
|
-
parser (3.3.
|
56
|
+
minitest (5.25.5)
|
57
|
+
parallel (1.27.0)
|
58
|
+
parser (3.3.8.0)
|
59
59
|
ast (~> 2.4.1)
|
60
60
|
racc
|
61
61
|
pp (0.6.2)
|
62
62
|
prettyprint
|
63
63
|
prettyprint (0.2.0)
|
64
|
+
prism (1.4.0)
|
64
65
|
pry (0.15.2)
|
65
66
|
coderay (~> 1.1)
|
66
67
|
method_source (~> 1.0)
|
67
|
-
psych (5.2.
|
68
|
+
psych (5.2.4)
|
68
69
|
date
|
69
70
|
stringio
|
70
71
|
racc (1.8.1)
|
71
72
|
rainbow (3.1.1)
|
72
73
|
rake (13.2.1)
|
73
|
-
rdoc (6.
|
74
|
+
rdoc (6.13.1)
|
74
75
|
psych (>= 4.0.0)
|
75
76
|
regexp_parser (2.10.0)
|
76
|
-
reline (0.6.
|
77
|
+
reline (0.6.1)
|
77
78
|
io-console (~> 0.5)
|
78
79
|
rspec (3.13.0)
|
79
80
|
rspec-core (~> 3.13.0)
|
@@ -81,16 +82,16 @@ GEM
|
|
81
82
|
rspec-mocks (~> 3.13.0)
|
82
83
|
rspec-core (3.13.3)
|
83
84
|
rspec-support (~> 3.13.0)
|
84
|
-
rspec-expectations (3.13.
|
85
|
+
rspec-expectations (3.13.4)
|
85
86
|
diff-lcs (>= 1.2.0, < 2.0)
|
86
87
|
rspec-support (~> 3.13.0)
|
87
|
-
rspec-mocks (3.13.
|
88
|
+
rspec-mocks (3.13.4)
|
88
89
|
diff-lcs (>= 1.2.0, < 2.0)
|
89
90
|
rspec-support (~> 3.13.0)
|
90
|
-
rspec-support (3.13.
|
91
|
+
rspec-support (3.13.3)
|
91
92
|
rspec_junit_formatter (0.6.0)
|
92
93
|
rspec-core (>= 2, < 4, != 2.12.0)
|
93
|
-
rubocop (1.
|
94
|
+
rubocop (1.75.5)
|
94
95
|
json (~> 2.3)
|
95
96
|
language_server-protocol (~> 3.17.0.2)
|
96
97
|
lint_roller (~> 1.1.0)
|
@@ -98,19 +99,21 @@ GEM
|
|
98
99
|
parser (>= 3.3.0.2)
|
99
100
|
rainbow (>= 2.2.2, < 4.0)
|
100
101
|
regexp_parser (>= 2.9.3, < 3.0)
|
101
|
-
rubocop-ast (>= 1.
|
102
|
+
rubocop-ast (>= 1.44.0, < 2.0)
|
102
103
|
ruby-progressbar (~> 1.7)
|
103
104
|
unicode-display_width (>= 2.4.0, < 4.0)
|
104
|
-
rubocop-ast (1.
|
105
|
-
parser (>= 3.3.
|
106
|
-
|
107
|
-
|
108
|
-
|
105
|
+
rubocop-ast (1.44.1)
|
106
|
+
parser (>= 3.3.7.2)
|
107
|
+
prism (~> 1.4)
|
108
|
+
rubocop-capybara (2.22.1)
|
109
|
+
lint_roller (~> 1.1)
|
110
|
+
rubocop (~> 1.72, >= 1.72.1)
|
111
|
+
rubocop-factory_bot (2.27.1)
|
109
112
|
lint_roller (~> 1.1)
|
110
113
|
rubocop (~> 1.72, >= 1.72.1)
|
111
|
-
rubocop-performance (1.
|
114
|
+
rubocop-performance (1.25.0)
|
112
115
|
lint_roller (~> 1.1)
|
113
|
-
rubocop (>= 1.
|
116
|
+
rubocop (>= 1.75.0, < 2.0)
|
114
117
|
rubocop-ast (>= 1.38.0, < 2.0)
|
115
118
|
rubocop-rake (0.7.1)
|
116
119
|
lint_roller (~> 1.1)
|
@@ -134,7 +137,7 @@ GEM
|
|
134
137
|
terminal-table
|
135
138
|
simplecov-html (0.13.1)
|
136
139
|
simplecov_json_formatter (0.1.4)
|
137
|
-
stringio (3.1.
|
140
|
+
stringio (3.1.7)
|
138
141
|
terminal-table (4.0.0)
|
139
142
|
unicode-display_width (>= 1.1.1, < 4)
|
140
143
|
tzinfo (2.0.6)
|
@@ -168,4 +171,4 @@ DEPENDENCIES
|
|
168
171
|
versionomy (~> 0.5)
|
169
172
|
|
170
173
|
BUNDLED WITH
|
171
|
-
2.6.
|
174
|
+
2.6.8
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
[![Conventional Commits][cc-img]][cc-url]
|
6
6
|
[![Current version][gem-img]][gem-url]
|
7
7
|
|
8
|
-
Examine all PDF files in
|
8
|
+
Examine all PDF files in lookup directories, remove passwords (if present), rename them, and copy them to a new directory using regular expressions.
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
@@ -20,8 +20,7 @@ You need to install pdf handling dependencies in order to use this gem.
|
|
20
20
|
#### macOS
|
21
21
|
|
22
22
|
```bash
|
23
|
-
brew install qpdf # for
|
24
|
-
brew install xpdf # for pdftotext
|
23
|
+
brew install qpdf xpdf # < for pdftotext
|
25
24
|
```
|
26
25
|
|
27
26
|
#### Fedora
|
@@ -38,38 +37,58 @@ sudo pacman -S qpdf poppler
|
|
38
37
|
|
39
38
|
## Usage
|
40
39
|
|
41
|
-
After installing this gem
|
42
|
-
|
40
|
+
After installing this gem, create your configuration file in one of the following directories:
|
43
41
|
- `~/.config/pdfh.yml`
|
44
42
|
- `~/pdfh.yml`
|
45
|
-
- or configure `PDFH_CONFIG_FILE` environment variable
|
43
|
+
- or configure the `PDFH_CONFIG_FILE` environment variable
|
46
44
|
|
45
|
+
Example configuration:
|
47
46
|
```yaml
|
48
47
|
---
|
49
|
-
lookup_dirs:
|
48
|
+
lookup_dirs: # Directories where all PDFs will be analyzed
|
50
49
|
- ~/Downloads
|
51
50
|
destination_base_path: ~/PDFs # Directory where all matching documents will be copied (MUST exist)
|
52
51
|
document_types:
|
53
|
-
- name:
|
52
|
+
- name: My Bank # Description (type)
|
54
53
|
re_file: '.*MyBankReg\.pdf' # Regular expression to match its filename
|
55
|
-
re_date: '
|
56
|
-
pwd:
|
54
|
+
re_date: '\d{1,2} de (\w+) de (\d+)' # Date regular expression
|
55
|
+
pwd: base64_encoded # [OPTIONAL] Password if the document is protected
|
57
56
|
store_path: "{year}/bank_docs" # Relative path to copy this document
|
58
57
|
name_template: '{period} {subtype}' # Template for new filename when copied
|
59
58
|
sub_types: # [OPTIONAL] In case you need an extra category
|
60
|
-
- name:
|
59
|
+
- name: AccountX # Regular expression to match this subtype
|
60
|
+
re_date: '\d{1,2} de (\w+)' # [OPTIONAL] Date regular expression
|
61
61
|
month_offset: -1 # [OPTIONAL] Integer (signed) value to adjust month
|
62
|
+
zip_types: # [OPTIONAL] Zip files to be processed BEFORE the PDFs
|
63
|
+
- name: My Bank 2 # Description
|
64
|
+
re_file: 'Document_MR5664_\d+_\d+.zip' # Regular expression to match its filename
|
65
|
+
pwd: base64_encoded # [OPTIONAL] Password if the document is protected
|
62
66
|
```
|
63
67
|
|
68
|
+
> [!CAUTION]
|
69
|
+
> `pwd` is not encrypted, so be careful with this option. It is stored as a base64 string as a very thin layer of obfuscation.
|
70
|
+
> You can use `echo -n 'password' | base64` to encode your password.
|
71
|
+
|
64
72
|
**Store Path** and **Name Template** supported placeholders:
|
65
73
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
74
|
+
Placeholder | Description | Example
|
75
|
+
--- |---------------------------| ---
|
76
|
+
`{original}` | Original filename | MyBankDocument2.pdf
|
77
|
+
`{period}` | Year-Month | 2022-01
|
78
|
+
`{year}` | Year | 2022
|
79
|
+
`{month}` | Month | 01
|
80
|
+
`{type}` | Document type **name** | My Bank
|
81
|
+
`{subtype}` | Sub type **name** | AccountX
|
82
|
+
`{extra}` | day if captured/matched | 01
|
83
|
+
|
84
|
+
`{period}`, `{year}`, `{month}` and `{extra}` are calculated from the date captured by the regular expression.
|
85
|
+
|
86
|
+
### Examples
|
87
|
+
|
88
|
+
Date text | RegEx | Captured
|
89
|
+
--- | --- | ---
|
90
|
+
`01/02/2025` | `(?<d>\d{2})\/(?<m>\d{2})\/(?<y>\d{4})` | d: `01` m: `02` y: `2025`
|
91
|
+
`072025 - ` | `(?<m>\d{2})(?<y>\d{4}) -` | m: `07` y: `2025`
|
73
92
|
|
74
93
|
## Development
|
75
94
|
|
@@ -85,6 +104,15 @@ build pdfh.gemspec
|
|
85
104
|
gem install pdfh-*
|
86
105
|
```
|
87
106
|
|
107
|
+
To release a new version, run:
|
108
|
+
|
109
|
+
```bash
|
110
|
+
rake bump
|
111
|
+
rake release
|
112
|
+
```
|
113
|
+
|
114
|
+
This will create a git tag for the version, push git commits and tags, and upload the `.gem` file to rubygems.org.
|
115
|
+
|
88
116
|
### Conventional Commits
|
89
117
|
|
90
118
|
```bash
|
data/bin/run
CHANGED
data/exe/pdfh
CHANGED
@@ -1,26 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
|
-
require "open3"
|
5
4
|
require "pdfh"
|
6
5
|
|
7
|
-
|
8
|
-
# @return [Boolean]
|
9
|
-
def validate_installed(*apps)
|
10
|
-
found_app = []
|
11
|
-
apps.each_with_object(found_app) do |app, result|
|
12
|
-
_stdout, _stderr, status = Open3.capture3("which #{app}")
|
13
|
-
puts "Missing #{app} command." unless status.success?
|
14
|
-
result << status.success?
|
15
|
-
end
|
16
|
-
|
17
|
-
found_app.all?
|
18
|
-
end
|
19
|
-
|
20
|
-
exit(1) unless validate_installed("qpdf", "pdftotext")
|
6
|
+
exit(1) if Pdfh::Utils::DependencyValidator.missing?(*Pdfh::REQUIRED_CMDS)
|
21
7
|
|
22
8
|
begin
|
23
|
-
Pdfh::Main.start
|
9
|
+
Pdfh::Main.start(argv: ARGV)
|
24
10
|
rescue StandardError => e
|
25
11
|
Pdfh.error_print e.message
|
26
12
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Pdfh
|
4
|
+
module Concerns
|
5
|
+
# Module that provides password handling capabilities for classes that contain
|
6
|
+
# password attributes. It handles Base64-encoded passwords by automatically
|
7
|
+
# detecting and decoding them when accessed through the password method.
|
8
|
+
module PasswordDecodable
|
9
|
+
# Returns the decoded password if it's Base64 encoded, otherwise returns it as is
|
10
|
+
# @return [String]
|
11
|
+
def password
|
12
|
+
return Base64.decode64(pwd) if base64?
|
13
|
+
|
14
|
+
pwd
|
15
|
+
end
|
16
|
+
|
17
|
+
# @return [Boolean]
|
18
|
+
def password?
|
19
|
+
base64?
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# @return [Boolean]
|
25
|
+
def base64?
|
26
|
+
pwd.is_a?(String) && pwd.size.positive? &&
|
27
|
+
Base64.strict_encode64(Base64.decode64(pwd)) == pwd
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/pdfh/main.rb
CHANGED
@@ -4,13 +4,12 @@ module Pdfh
|
|
4
4
|
# Main functionality. This class is intended to manage the pdf documents
|
5
5
|
class Main
|
6
6
|
class << self
|
7
|
+
# @param argv [Array<String>]
|
7
8
|
# @return [void]
|
8
|
-
def start
|
9
|
-
arg_options = Pdfh::OptParser.parse_argv
|
9
|
+
def start(argv:)
|
10
|
+
arg_options = Pdfh::OptParser.new(argv: argv).parse_argv
|
10
11
|
@options = Options.new(arg_options)
|
11
|
-
|
12
|
-
Pdfh.instance_variable_set(:@options, options)
|
13
|
-
Pdfh.instance_variable_set(:@console, Console.new(options.verbose?))
|
12
|
+
assign_global_utils(@options)
|
14
13
|
Pdfh.print_options(arg_options)
|
15
14
|
|
16
15
|
@settings = SettingsBuilder.build
|
@@ -30,8 +29,15 @@ module Pdfh
|
|
30
29
|
|
31
30
|
attr_reader :options, :settings
|
32
31
|
|
32
|
+
# @param options [Options]
|
33
|
+
# @return [void]
|
34
|
+
def assign_global_utils(options)
|
35
|
+
Pdfh.instance_variable_set(:@options, options)
|
36
|
+
Pdfh.instance_variable_set(:@console, Console.new(options.verbose?))
|
37
|
+
end
|
38
|
+
|
33
39
|
# @param [String] file_name
|
34
|
-
# @return [DocumentType]
|
40
|
+
# @return [DocumentType, nil]
|
35
41
|
def match_doc_type(file_name)
|
36
42
|
settings.document_types.each do |type|
|
37
43
|
match = type.re_file.match(file_name)
|
@@ -64,31 +70,74 @@ module Pdfh
|
|
64
70
|
|
65
71
|
# @param [String] work_directory
|
66
72
|
# @return [void]
|
73
|
+
def process_zip_files(work_directory)
|
74
|
+
@settings.zip_types&.each do |zip_type|
|
75
|
+
find_files(work_directory, :zip).each do |file|
|
76
|
+
next unless zip_type.re_file.match?(File.basename(file))
|
77
|
+
|
78
|
+
Pdfh.info " > Processing zip file: #{file.green}"
|
79
|
+
password_opt = "-P #{zip_type.password}" if zip_type.password?
|
80
|
+
`unzip -o #{password_opt} #{file} -d #{work_directory}`
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# @param directory [String]
|
86
|
+
# @param type [String, Symbol]
|
87
|
+
# @return [Array<String>]
|
88
|
+
def find_files(directory, type)
|
89
|
+
glob = File.join(directory, "*.#{type}")
|
90
|
+
Dir.glob(glob)
|
91
|
+
end
|
92
|
+
|
67
93
|
def process_directory(work_directory)
|
68
94
|
Pdfh.headline(work_directory)
|
69
|
-
|
70
|
-
|
71
|
-
files =
|
95
|
+
process_zip_files(work_directory) if @settings.zip_types?
|
96
|
+
processed_result = RunResult.new
|
97
|
+
files = find_files(work_directory, :pdf)
|
72
98
|
files.each do |pdf_file|
|
73
99
|
type = match_doc_type(pdf_file)
|
74
100
|
if type
|
75
|
-
processed_count += 1
|
76
101
|
PdfFileHandler.new(pdf_file, type).process_document(settings.base_path)
|
102
|
+
processed_result.add_processed(pdf_file)
|
77
103
|
else
|
78
|
-
|
104
|
+
processed_result.add_ignored(pdf_file)
|
79
105
|
end
|
80
106
|
end
|
81
|
-
|
82
|
-
return unless Pdfh.verbose?
|
83
|
-
|
84
|
-
puts "\n No document type found for these PDF files:" if ignored_files.any?
|
85
|
-
ignored_files.each.with_index(1) { |file, index| Pdfh.ident_print index, file, color: :magenta }
|
107
|
+
print_processing_results(processed_result)
|
86
108
|
end
|
87
109
|
|
88
110
|
# @return [String]
|
89
111
|
def base_name_no_ext(file)
|
90
112
|
File.basename(file, File.extname(file))
|
91
113
|
end
|
114
|
+
|
115
|
+
def print_processing_results(result)
|
116
|
+
Pdfh.info " (No files processed)".colorize(:light_black) if result.processed.empty?
|
117
|
+
return unless Pdfh.verbose?
|
118
|
+
|
119
|
+
Pdfh.info "\n No document type found for these PDF files:" if result.ignored.any?
|
120
|
+
result.ignored.each.with_index(1) do |file, index|
|
121
|
+
Pdfh.ident_print index, base_name_no_ext(file), color: :magenta
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# keeps track of the processed and ignored files
|
127
|
+
class RunResult
|
128
|
+
attr_reader :processed, :ignored
|
129
|
+
|
130
|
+
# @return [self]
|
131
|
+
def initialize
|
132
|
+
@processed = []
|
133
|
+
@ignored = []
|
134
|
+
end
|
135
|
+
|
136
|
+
# @return [void]
|
137
|
+
def add_ignored(file) = @ignored << file
|
138
|
+
|
139
|
+
# @return [void]
|
140
|
+
def add_processed(file) = @processed << file
|
92
141
|
end
|
93
142
|
end
|
94
143
|
end
|
@@ -3,6 +3,22 @@
|
|
3
3
|
module Pdfh
|
4
4
|
# Represents a type of document that can be processed by pdfh
|
5
5
|
class DocumentType
|
6
|
+
include Concerns::PasswordDecodable
|
7
|
+
|
8
|
+
# @!attribute [r] name
|
9
|
+
# @return [String] The name of the document type.
|
10
|
+
# @!attribute [r] re_file
|
11
|
+
# @return [Regexp] The regular expression to match file names.
|
12
|
+
# @!attribute [r] re_date
|
13
|
+
# @return [Regexp] The regular expression to extract dates and its information.
|
14
|
+
# @!attribute [r] pwd
|
15
|
+
# @return [String, nil] The base64 password for the document type, if any.
|
16
|
+
# @!attribute [r] store_path
|
17
|
+
# @return [String] The path where the document will be stored.
|
18
|
+
# @!attribute [r] name_template
|
19
|
+
# @return [String] The template for generating document names.
|
20
|
+
# @!attribute [r] sub_types
|
21
|
+
# @return [Array<DocumentSubType>, nil] The subtypes of the document, if any.
|
6
22
|
attr_reader :name, :re_file, :re_date, :pwd, :store_path, :name_template, :sub_types
|
7
23
|
|
8
24
|
# @param args [Hash]
|
@@ -41,13 +57,6 @@ module Pdfh
|
|
41
57
|
sub_types&.find { |st| /#{st.name}/i.match?(text) }
|
42
58
|
end
|
43
59
|
|
44
|
-
# @return [String]
|
45
|
-
def password
|
46
|
-
return Base64.decode64(pwd) if base64?
|
47
|
-
|
48
|
-
pwd
|
49
|
-
end
|
50
|
-
|
51
60
|
# @param values [Hash{Symbol->String}]
|
52
61
|
# @return [String]
|
53
62
|
def generate_new_name(values)
|
@@ -64,11 +73,6 @@ module Pdfh
|
|
64
73
|
|
65
74
|
attr_accessor :path_validator, :name_validator
|
66
75
|
|
67
|
-
# @return [boolean]
|
68
|
-
def base64?
|
69
|
-
pwd.is_a?(String) && Base64.strict_encode64(Base64.decode64(pwd)) == pwd
|
70
|
-
end
|
71
|
-
|
72
76
|
# @param sub_types [Array<Hash{Symbol->String}>]
|
73
77
|
# @return [Array<DocumentSubType>]
|
74
78
|
def extract_subtypes(sub_types)
|
data/lib/pdfh/models/settings.rb
CHANGED
@@ -3,7 +3,13 @@
|
|
3
3
|
module Pdfh
|
4
4
|
# Handles the config yaml data mapping, and associates a file name with a doc type
|
5
5
|
class Settings
|
6
|
-
|
6
|
+
# @!attribute [r] lookup_dirs
|
7
|
+
# @return [Array<String>] List of directories to look up for processing.
|
8
|
+
# @!attribute [r] base_path
|
9
|
+
# @return [String] The base directory path for storing processed files.
|
10
|
+
# @!attribute [r] zip_types
|
11
|
+
# @return [Array<ZipType>, nil] List of zip types to process, or nil if none.
|
12
|
+
attr_reader :lookup_dirs, :base_path, :zip_types
|
7
13
|
|
8
14
|
# @param config_data [Hash]
|
9
15
|
# @return [self]
|
@@ -15,7 +21,8 @@ module Pdfh
|
|
15
21
|
lookup_dirs.each.with_index(1) { |dir, idx| Pdfh.debug " #{idx}. #{dir}" }
|
16
22
|
Pdfh.debug
|
17
23
|
|
18
|
-
|
24
|
+
build_doc_types(config_data[:document_types])
|
25
|
+
build_zip_types(config_data[:zip_types]) if config_data.key?(:zip_types)
|
19
26
|
end
|
20
27
|
|
21
28
|
# @return [Array<DocumentType>]
|
@@ -28,8 +35,14 @@ module Pdfh
|
|
28
35
|
@document_types[id]
|
29
36
|
end
|
30
37
|
|
38
|
+
# @return [Boolean]
|
39
|
+
def zip_types?
|
40
|
+
!!zip_types&.any?
|
41
|
+
end
|
42
|
+
|
31
43
|
private
|
32
44
|
|
45
|
+
# @param lookup_dirs_list [Array<String>]
|
33
46
|
# @return [void]
|
34
47
|
def process_lookup_dirs(lookup_dirs_list)
|
35
48
|
@lookup_dirs = lookup_dirs_list.filter_map do |dir|
|
@@ -44,14 +57,16 @@ module Pdfh
|
|
44
57
|
end
|
45
58
|
|
46
59
|
# @return [void]
|
60
|
+
# @param dir [String]
|
47
61
|
def process_destination_base(dir)
|
48
62
|
@base_path = File.expand_path(dir)
|
49
63
|
raise ArgumentError, "Destination base directory is not configured." if @base_path.nil?
|
50
64
|
raise ArgumentError, "Destination base directory #{@base_path} does not exist." unless File.directory?(@base_path)
|
51
65
|
end
|
52
66
|
|
53
|
-
# @
|
54
|
-
|
67
|
+
# @param doc_types [Array<Hash>]
|
68
|
+
# @return [void]
|
69
|
+
def build_doc_types(doc_types)
|
55
70
|
@document_types = doc_types.each_with_object({}) do |data, result|
|
56
71
|
doc_type = DocumentType.new(data)
|
57
72
|
result.store(doc_type.gid, doc_type)
|
@@ -60,5 +75,13 @@ module Pdfh
|
|
60
75
|
Pdfh.backtrace_print e if Pdfh.verbose?
|
61
76
|
end
|
62
77
|
end
|
78
|
+
|
79
|
+
# @param zip_types [Array<Hash>]
|
80
|
+
# @return [void]
|
81
|
+
def build_zip_types(zip_types)
|
82
|
+
exit(1) if Pdfh::Utils::DependencyValidator.missing?(:unzip)
|
83
|
+
|
84
|
+
@zip_types = zip_types.compact.map { ZipType.new(_1) }
|
85
|
+
end
|
63
86
|
end
|
64
87
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Pdfh
|
4
|
+
# Zip files which contain PDF files that need pre-processing
|
5
|
+
class ZipType
|
6
|
+
include Concerns::PasswordDecodable
|
7
|
+
|
8
|
+
attr_reader :name, :re_file, :pwd
|
9
|
+
|
10
|
+
# @param args [Hash]
|
11
|
+
# @return [self]
|
12
|
+
def initialize(args)
|
13
|
+
args.each { |k, v| instance_variable_set(:"@#{k}", v) }
|
14
|
+
@re_file = Regexp.new(re_file)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "open3"
|
4
|
+
|
5
|
+
module Pdfh
|
6
|
+
module Utils
|
7
|
+
# Provides methods to validate external dependencies
|
8
|
+
module DependencyValidator
|
9
|
+
module_function
|
10
|
+
|
11
|
+
# Validates if the required command-line applications are installed
|
12
|
+
# @param apps [Array<String>] names of required command-line applications
|
13
|
+
# @return [Boolean] true if all applications are installed, false otherwise
|
14
|
+
def installed?(*apps)
|
15
|
+
missing = apps.filter_map do |app|
|
16
|
+
_stdout, _stderr, status = Open3.capture3("which #{app}")
|
17
|
+
|
18
|
+
app.to_s unless status.success?
|
19
|
+
end
|
20
|
+
|
21
|
+
if missing.any?
|
22
|
+
errors = missing.map(&:red)
|
23
|
+
puts "Required dependency #{errors.join(", ")} not found. Please install it before continuing."
|
24
|
+
end
|
25
|
+
missing.empty?
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param apps [Array<String>]
|
29
|
+
# @return [Boolean] true if any application is missing, false if all are installed
|
30
|
+
def missing?(*apps)
|
31
|
+
!installed?(*apps)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -5,58 +5,73 @@ require "optparse"
|
|
5
5
|
module Pdfh
|
6
6
|
# Handles Argument options
|
7
7
|
class OptParser
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
opts.on("-v", "--verbose", "Show more output. Useful for debug")
|
21
|
-
opts.on("-d", "--dry", "Dry run, does not write new pdf")
|
8
|
+
# @param argv [Array<String>] command line arguments (i.e. ARGV)
|
9
|
+
# @param console [Pdfh::Console, nil]
|
10
|
+
# @return [self]
|
11
|
+
def initialize(argv:, console: nil)
|
12
|
+
@argv = argv
|
13
|
+
@console = console || Console.new(false)
|
14
|
+
@options = {
|
15
|
+
verbose: false,
|
16
|
+
dry: false,
|
17
|
+
type: nil,
|
18
|
+
files: []
|
19
|
+
}
|
22
20
|
end
|
23
21
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
puts OPT_PARSER.help
|
36
|
-
exit 1
|
37
|
-
end
|
22
|
+
# @return [Hash] Parsed options including flags and file arguments
|
23
|
+
def parse_argv
|
24
|
+
option_parser = build_option_parser
|
25
|
+
non_option_args = option_parser.parse!(@argv)
|
26
|
+
@options[:files] = non_option_args
|
27
|
+
@options.transform_keys { |key| key.to_s.tr("-", "_").to_sym }
|
28
|
+
rescue OptionParser::InvalidOption => e
|
29
|
+
@console.error_print(e.message, exit_app: false)
|
30
|
+
puts option_parser.help
|
31
|
+
exit 1
|
32
|
+
end
|
38
33
|
|
39
|
-
|
40
|
-
def version
|
41
|
-
puts "#{OPT_PARSER.program_name} v#{Pdfh::VERSION}"
|
42
|
-
end
|
34
|
+
private
|
43
35
|
|
44
|
-
|
45
|
-
|
46
|
-
|
36
|
+
# @return [OptionParser] Configured OptionParser instance
|
37
|
+
def build_option_parser
|
38
|
+
OptionParser.new do |opts|
|
39
|
+
opts.banner = "Usage: #{opts.program_name} [options] [file1.pdf, ...]"
|
40
|
+
opts.separator ""
|
41
|
+
opts.separator "Specific options:"
|
42
|
+
|
43
|
+
opts.on("-tID", "--type=ID", "Document type id (requires a trailing file list)") { @options[:type] = _1 }
|
44
|
+
opts.on("-v", "--verbose", "Show more output. Useful for debug") { @options[:verbose] = true }
|
45
|
+
opts.on("-d", "--dry", "Dry run, does not write new pdf") { @options[:dry] = true }
|
46
|
+
opts.on_tail("-T", "--list-types", "List document types in configuration") { list_types && exit }
|
47
|
+
opts.on_tail("-V", "--version", "Show version") { version || exit }
|
48
|
+
opts.on_tail("-h", "--help", "help (this dialog)") { help || exit }
|
47
49
|
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# @return [nil]
|
53
|
+
def version
|
54
|
+
@console.info "#{build_option_parser.program_name} v#{Pdfh::VERSION}"
|
55
|
+
end
|
56
|
+
|
57
|
+
# @return [nil]
|
58
|
+
def help
|
59
|
+
@console.info build_option_parser
|
60
|
+
end
|
61
|
+
|
62
|
+
# Lists the available document types
|
63
|
+
# @return [nil]
|
64
|
+
def list_types
|
65
|
+
Pdfh.instance_variable_set(:@options, Options.new(@options))
|
66
|
+
Pdfh.instance_variable_set(:@console, @console)
|
48
67
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
settings.document_types.each do |type|
|
57
|
-
puts "#{" " * ident}#{type.gid.ljust(max_width).yellow} #{type.name}"
|
58
|
-
end
|
59
|
-
nil
|
68
|
+
settings = SettingsBuilder.build
|
69
|
+
spacing = " " * 2
|
70
|
+
max_width = settings.document_types.map { |t| t.gid.size }.max
|
71
|
+
@console.info "#{spacing}#{"ID".ljust(max_width)} Type Name"
|
72
|
+
@console.info "#{spacing}#{"—" * max_width} #{"—" * 23}"
|
73
|
+
settings.document_types.each do |type|
|
74
|
+
@console.info "#{spacing}#{type.gid.ljust(max_width).yellow} #{type.name}"
|
60
75
|
end
|
61
76
|
end
|
62
77
|
end
|
data/lib/pdfh/version.rb
CHANGED
data/lib/pdfh.rb
CHANGED
@@ -9,15 +9,20 @@ require "yaml"
|
|
9
9
|
|
10
10
|
require_relative "ext/string"
|
11
11
|
|
12
|
+
# Concerns
|
13
|
+
require_relative "pdfh/concerns/password_decodable"
|
14
|
+
|
12
15
|
# Models
|
13
16
|
require_relative "pdfh/models/document"
|
14
17
|
require_relative "pdfh/models/document_period"
|
15
18
|
require_relative "pdfh/models/document_sub_type"
|
16
19
|
require_relative "pdfh/models/document_type"
|
17
20
|
require_relative "pdfh/models/settings"
|
21
|
+
require_relative "pdfh/models/zip_types"
|
18
22
|
|
19
23
|
# Utils
|
20
24
|
require_relative "pdfh/utils/console"
|
25
|
+
require_relative "pdfh/utils/dependency_validator"
|
21
26
|
require_relative "pdfh/utils/month"
|
22
27
|
require_relative "pdfh/utils/opt_parser"
|
23
28
|
require_relative "pdfh/utils/options"
|
@@ -31,6 +36,8 @@ require_relative "pdfh/version"
|
|
31
36
|
|
32
37
|
# Gem entry point
|
33
38
|
module Pdfh
|
39
|
+
REQUIRED_CMDS = %i[qpdf pdftotext].freeze
|
40
|
+
|
34
41
|
# Settings not found
|
35
42
|
class SettingsIOError < StandardError; end
|
36
43
|
|
@@ -45,6 +52,7 @@ module Pdfh
|
|
45
52
|
class << self
|
46
53
|
extend Forwardable
|
47
54
|
def_delegators :@options, :verbose?, :dry?, :file_mode?
|
48
|
-
def_delegators :@console, :ident_print, :warn_print, :error_print, :backtrace_print, :headline, :debug, :info,
|
55
|
+
def_delegators :@console, :ident_print, :warn_print, :error_print, :backtrace_print, :headline, :debug, :info,
|
56
|
+
:print_options
|
49
57
|
end
|
50
58
|
end
|
data/mise.toml
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
[tools]
|
2
|
-
ruby = "3.4.
|
2
|
+
ruby = "3.4.3"
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Isaias Piña
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: colorize
|
@@ -33,6 +33,7 @@ extensions: []
|
|
33
33
|
extra_rdoc_files: []
|
34
34
|
files:
|
35
35
|
- ".gitignore"
|
36
|
+
- ".pre-commit-config.yaml"
|
36
37
|
- ".rspec"
|
37
38
|
- ".rubocop.yml"
|
38
39
|
- ".rubocop_todo.yml"
|
@@ -50,14 +51,17 @@ files:
|
|
50
51
|
- exe/pdfh
|
51
52
|
- lib/ext/string.rb
|
52
53
|
- lib/pdfh.rb
|
54
|
+
- lib/pdfh/concerns/password_decodable.rb
|
53
55
|
- lib/pdfh/main.rb
|
54
56
|
- lib/pdfh/models/document.rb
|
55
57
|
- lib/pdfh/models/document_period.rb
|
56
58
|
- lib/pdfh/models/document_sub_type.rb
|
57
59
|
- lib/pdfh/models/document_type.rb
|
58
60
|
- lib/pdfh/models/settings.rb
|
61
|
+
- lib/pdfh/models/zip_types.rb
|
59
62
|
- lib/pdfh/settings_template.rb
|
60
63
|
- lib/pdfh/utils/console.rb
|
64
|
+
- lib/pdfh/utils/dependency_validator.rb
|
61
65
|
- lib/pdfh/utils/month.rb
|
62
66
|
- lib/pdfh/utils/opt_parser.rb
|
63
67
|
- lib/pdfh/utils/options.rb
|
@@ -90,7 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
94
|
- !ruby/object:Gem::Version
|
91
95
|
version: '0'
|
92
96
|
requirements: []
|
93
|
-
rubygems_version: 3.6.
|
97
|
+
rubygems_version: 3.6.8
|
94
98
|
specification_version: 4
|
95
99
|
summary: Organize PDF files
|
96
100
|
test_files: []
|