pdfh 0.1.4 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +37 -10
- data/.rubocop.yml +22 -5
- data/.rubocop_todo.yml +152 -0
- data/.ruby-version +1 -1
- data/CHANGELOG.md +11 -0
- data/Gemfile +14 -3
- data/Gemfile.lock +76 -30
- data/README.md +14 -2
- data/Rakefile +19 -2
- data/bin/console +6 -6
- data/exe/pdfh +19 -8
- data/lib/ext/string.rb +13 -0
- data/lib/pdfh.rb +65 -47
- data/lib/pdfh/document.rb +83 -109
- data/lib/pdfh/month.rb +41 -0
- data/lib/pdfh/pdf_handler.rb +54 -0
- data/lib/pdfh/settings.rb +21 -11
- data/lib/pdfh/utils.rb +9 -4
- data/lib/pdfh/version.rb +1 -1
- data/pdfh.gemspec +22 -27
- metadata +13 -81
- data/.ruby-gemset +0 -1
- data/.travis.yml +0 -7
data/lib/pdfh/month.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Pdfh
  ##
  # Handles Month conversions
  #
  # Translates the different ways a month can be written (number, Spanish
  # three letter abbreviation or full Spanish name) into its month number.
  class Month
    # Spanish month names mapped to their month number.
    MONTHS = {
      enero: 1,
      febrero: 2,
      marzo: 3,
      abril: 4,
      mayo: 5,
      junio: 6,
      julio: 7,
      agosto: 8,
      septiembre: 9,
      octubre: 10,
      noviembre: 11,
      diciembre: 12
    }.freeze

    ##
    # Converts a month representation into its number (1..12).
    #
    # The value is coerced to a String, stripped and downcased first, so
    # Integers, nil and differently cased names are handled without raising.
    #
    # @param [String, Integer, nil] month a month number ("3", 3), a three
    #   letter abbreviation ("mar", "nov") or a full name ("marzo")
    # @return [Integer, nil] the month number, or nil when it is not recognized
    def self.normalize(month)
      name = month.to_s.strip.downcase
      return nil if name.empty?

      # When param is a number
      number = name.to_i
      return number if number.between?(1, 12)

      # When param is a 3 char month: 'mar', 'nov'
      if name.size == 3
        abbreviated = MONTHS.find { |full_name, _| full_name.to_s.start_with?(name) }
        return abbreviated.last if abbreviated
      end

      # When param has a direct match
      MONTHS[name.to_sym]
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "tempfile"

module Pdfh
  ##
  # Handles the Pdf document text extraction and password removal
  #
  # Both operations run the +qpdf+ / +pdftotext+ command line tools. The
  # commands are executed as argument arrays (no shell is involved), so file
  # names and passwords containing quotes, spaces or other shell
  # metacharacters are passed to the tools verbatim.
  #
  # TODO: Replace command utils with this gem
  #   require 'pdf-reader'
  #
  #   reader = PDF::Reader.new(temp)
  #   reader.pages.each do |page|
  #     @text << page.text
  #   end
  class PdfHandler
    attr_reader :file, :password

    # @param [String] file path of the pdf to process
    # @param [String, nil] password password used to decrypt the pdf, if any
    def initialize(file, password)
      @file = file
      @password = password
    end

    ##
    # Gets the text from the pdf in order to execute
    # the regular expression matches
    #
    # The pdf is first decrypted/uncompressed with qpdf into a temporary
    # file, which is removed once the text has been read.
    #
    # @return [String] the text printed by pdftotext (empty when a tool is missing)
    def extract_text
      Tempfile.create("pdfh") do |temp_file|
        temp = temp_file.path
        temp_file.close # qpdf writes to the path itself, no open handle is needed
        Verbose.print " --> #{temp} temporal file assigned."

        decrypt = qpdf_command("--stream-data=uncompress", @file, temp)
        Verbose.print " Command: #{decrypt.join(' ')}"
        run(decrypt)

        to_text = ["pdftotext", "-enc", "UTF-8", temp, "-"]
        Verbose.print " Command: #{to_text.join(' ')}"
        text = run(to_text)
        Verbose.print " Text extracted: #{text}"
        text
      end
    end

    ##
    # Writes a decrypted copy of the pdf to +full_path+.
    # Nothing is written when the dry mode is active.
    #
    # @param [String] dir_path directory that must already exist
    # @param [String] full_path destination of the decrypted pdf
    # @return [nil]
    # @raise [IOError] when +dir_path+ does not exist or the file was not created
    def write_pdf(dir_path, full_path)
      Verbose.print "~~~~~~~~~~~~~~~~~~ Writing PDFs"
      raise IOError, "Path #{dir_path} not found." unless Dir.exist?(dir_path)

      cmd = qpdf_command(@file, full_path)
      Verbose.print " Write pdf command: #{cmd.join(' ')}"

      return if Dry.active?

      run(cmd)
      raise IOError, "File #{full_path} was not created." unless File.file?(full_path)
    end

    private

    # Builds the qpdf decrypt invocation as an argv array, adding the
    # password option only when a password was given.
    def qpdf_command(*arguments)
      command = ["qpdf"]
      command << "--password=#{@password}" if @password
      command << "--decrypt"
      command.concat(arguments.map(&:to_s))
    end

    # Runs the command without a shell and returns its standard output.
    # A missing executable yields an empty output, as the shell based
    # implementation did, so callers keep their existing failure handling.
    def run(command)
      IO.popen(command, &:read)
    rescue Errno::ENOENT
      Verbose.print " Command not found: #{command.first}"
      ""
    end
  end
end
|
data/lib/pdfh/settings.rb
CHANGED
@@ -1,27 +1,25 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
3
|
+
require "yaml"
|
4
|
+
require "ostruct"
|
5
|
+
require "base64"
|
6
6
|
|
7
7
|
module Pdfh
|
8
8
|
##
|
9
9
|
# Handles the config yaml data mapping, and associates a file name with a doc type
|
10
10
|
class Settings
|
11
|
-
|
11
|
+
attr_reader :scrape_dirs, :base_path, :document_types
|
12
12
|
|
13
13
|
def initialize(file)
|
14
14
|
file_hash = YAML.load_file(file)
|
15
15
|
Verbose.print "Loaded configuration file: #{file}"
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
self.base_path = File.expand_path(file_hash['base_path'])
|
21
|
-
self.document_types = process_doc_types(file_hash['document_types'])
|
17
|
+
@scrape_dirs = process_scrape_dirs(file_hash["scrape_dirs"])
|
18
|
+
@base_path = File.expand_path(file_hash["base_path"])
|
19
|
+
@document_types = process_doc_types(file_hash["document_types"])
|
22
20
|
|
23
|
-
Verbose.print
|
24
|
-
scrape_dirs.each { |
|
21
|
+
Verbose.print "Processing directories:"
|
22
|
+
scrape_dirs.each { |dir| Verbose.print " - #{dir}" }
|
25
23
|
Verbose.print
|
26
24
|
end
|
27
25
|
|
@@ -38,6 +36,18 @@ module Pdfh
|
|
38
36
|
|
39
37
|
private
|
40
38
|
|
39
|
+
##
# Expands every configured scrape directory and keeps only the ones that
# exist on disk; a verbose notice is printed for each missing directory.
#
# A missing (nil) setting yields an empty list, and a single directory
# given as a plain String is accepted as a one element list.
#
# @param [Array<String>, String, nil] scrape_dirs_list
# @return [Array<String>] absolute paths of the existing directories
def process_scrape_dirs(scrape_dirs_list)
  Array(scrape_dirs_list).each_with_object([]) do |dir, existing|
    expanded = File.expand_path(dir)
    if File.directory?(expanded)
      existing << expanded
    else
      Verbose.print " ** Directory #{dir} does not exists."
    end
  end
end
|
50
|
+
|
41
51
|
def process_doc_types(doc_types)
|
42
52
|
doc_types.map do |x|
|
43
53
|
object = OpenStruct.new(x)
|
data/lib/pdfh/utils.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
3
|
+
require "colorize"
|
4
4
|
|
5
|
+
# Contains all generic short functionality
|
5
6
|
module Pdfh
|
6
|
-
class Error < StandardError; end
|
7
|
-
|
8
7
|
##
|
9
8
|
# Keeps Verbose option in whole project
|
10
9
|
class Verbose
|
@@ -16,7 +15,7 @@ module Pdfh
|
|
16
15
|
@active
|
17
16
|
end
|
18
17
|
|
19
|
-
def print(msg =
|
18
|
+
def print(msg = "")
|
20
19
|
puts msg.colorize(:cyan) if active?
|
21
20
|
end
|
22
21
|
end
|
@@ -34,4 +33,10 @@ module Pdfh
|
|
34
33
|
end
|
35
34
|
end
|
36
35
|
end
|
36
|
+
|
37
|
+
##
# Prints the exception message in red, prefixed with the line number taken
# from the first backtrace entry, and optionally terminates the program.
#
# The line is shown as "?" when the exception has no backtrace (it was
# never raised) or the entry has no line number, so reporting an error
# cannot itself fail.
#
# @param [Exception] exception the error to report
# @param [Boolean] exit_app when true the process exits with status 1
def self.print_error(exception, exit_app: true)
  location = exception.backtrace&.first.to_s
  line = location[/:(\d+)/, 1] || "?"
  puts "Error, Line[#{line}]: #{exception.message}.".colorize(:red)
  exit 1 if exit_app
end
|
37
42
|
end
|
data/lib/pdfh/version.rb
CHANGED
data/pdfh.gemspec
CHANGED
@@ -1,48 +1,43 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
lib = File.expand_path(
|
4
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
-
|
3
|
+
# lib = File.expand_path("lib", __dir__)
|
4
|
+
# $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require_relative "lib/pdfh/version"
|
6
6
|
|
7
7
|
Gem::Specification.new do |spec|
|
8
|
-
spec.name =
|
8
|
+
spec.name = "pdfh"
|
9
9
|
spec.version = Pdfh::VERSION
|
10
|
-
spec.authors = [
|
11
|
-
spec.email = [
|
10
|
+
spec.authors = ["Isaias Piña"]
|
11
|
+
spec.email = ["iax7@users.noreply.github.com"]
|
12
12
|
|
13
|
-
spec.summary =
|
14
|
-
spec.description =
|
15
|
-
|
16
|
-
spec.
|
17
|
-
spec.
|
13
|
+
spec.summary = "Organize PDF files"
|
14
|
+
spec.description = "Examine all PDF files in scrape directories, remove password (if has one), "\
|
15
|
+
"rename and copy to a new directory using regular expresions."
|
16
|
+
spec.homepage = "https://github.com/iax7/pdfh"
|
17
|
+
spec.license = "MIT"
|
18
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
18
19
|
|
19
20
|
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
20
21
|
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
21
22
|
if spec.respond_to?(:metadata)
|
22
|
-
spec.metadata[
|
23
|
+
spec.metadata["allowed_push_host"] = "https://rubygems.org"
|
23
24
|
|
24
|
-
spec.metadata[
|
25
|
-
spec.metadata[
|
26
|
-
spec.metadata[
|
25
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
26
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
27
|
+
spec.metadata["changelog_uri"] = "https://raw.githubusercontent.com/iax7/pdfh/master/CHANGELOG.md"
|
27
28
|
else
|
28
|
-
raise
|
29
|
-
|
29
|
+
raise "RubyGems 2.0 or newer is required to protect against " \
|
30
|
+
"public gem pushes."
|
30
31
|
end
|
31
32
|
|
32
33
|
# Specify which files should be added to the gem when it is released.
|
33
34
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
34
35
|
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
35
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{
|
36
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:\.\w+|docs|test|spec|features)/}) }
|
36
37
|
end
|
37
|
-
spec.bindir =
|
38
|
+
spec.bindir = "exe"
|
38
39
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
39
|
-
spec.require_paths = [
|
40
|
+
spec.require_paths = ["lib"]
|
40
41
|
|
41
|
-
spec.add_dependency
|
42
|
-
|
43
|
-
spec.add_development_dependency 'bundler', '~> 1.17.2'
|
44
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
45
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
46
|
-
spec.add_development_dependency 'simplecov', '~> 0.16.1'
|
47
|
-
spec.add_development_dependency 'simplecov-console', '~> 0.4.2'
|
42
|
+
spec.add_dependency "colorize", "~> 0.8.0"
|
48
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Isaias Piña
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colorize
|
@@ -16,84 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.8.
|
19
|
+
version: 0.8.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.8.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: bundler
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 1.17.2
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: 1.17.2
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '10.0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '10.0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rspec
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '3.0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '3.0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: simplecov
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 0.16.1
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 0.16.1
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: simplecov-console
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - "~>"
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: 0.4.2
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - "~>"
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: 0.4.2
|
26
|
+
version: 0.8.0
|
97
27
|
description: Examine all PDF files in scrape directories, remove password (if has
|
98
28
|
one), rename and copy to a new directory using regular expresions.
|
99
29
|
email:
|
@@ -106,9 +36,8 @@ files:
|
|
106
36
|
- ".gitignore"
|
107
37
|
- ".rspec"
|
108
38
|
- ".rubocop.yml"
|
109
|
-
- ".
|
39
|
+
- ".rubocop_todo.yml"
|
110
40
|
- ".ruby-version"
|
111
|
-
- ".travis.yml"
|
112
41
|
- CHANGELOG.md
|
113
42
|
- CODE_OF_CONDUCT.md
|
114
43
|
- Gemfile
|
@@ -119,8 +48,11 @@ files:
|
|
119
48
|
- bin/console
|
120
49
|
- bin/setup
|
121
50
|
- exe/pdfh
|
51
|
+
- lib/ext/string.rb
|
122
52
|
- lib/pdfh.rb
|
123
53
|
- lib/pdfh/document.rb
|
54
|
+
- lib/pdfh/month.rb
|
55
|
+
- lib/pdfh/pdf_handler.rb
|
124
56
|
- lib/pdfh/settings.rb
|
125
57
|
- lib/pdfh/utils.rb
|
126
58
|
- lib/pdfh/version.rb
|
@@ -133,7 +65,7 @@ metadata:
|
|
133
65
|
homepage_uri: https://github.com/iax7/pdfh
|
134
66
|
source_code_uri: https://github.com/iax7/pdfh
|
135
67
|
changelog_uri: https://raw.githubusercontent.com/iax7/pdfh/master/CHANGELOG.md
|
136
|
-
post_install_message:
|
68
|
+
post_install_message:
|
137
69
|
rdoc_options: []
|
138
70
|
require_paths:
|
139
71
|
- lib
|
@@ -141,15 +73,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
141
73
|
requirements:
|
142
74
|
- - ">="
|
143
75
|
- !ruby/object:Gem::Version
|
144
|
-
version: 2.
|
76
|
+
version: 2.5.0
|
145
77
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
146
78
|
requirements:
|
147
79
|
- - ">="
|
148
80
|
- !ruby/object:Gem::Version
|
149
81
|
version: '0'
|
150
82
|
requirements: []
|
151
|
-
rubygems_version: 3.
|
152
|
-
signing_key:
|
83
|
+
rubygems_version: 3.2.4
|
84
|
+
signing_key:
|
153
85
|
specification_version: 4
|
154
86
|
summary: Organize PDF files
|
155
87
|
test_files: []
|
data/.ruby-gemset
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
pdfh
|