uploadconvert 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/uploadconvert.rb +100 -0
- metadata +47 -0
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'docsplit'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
# Converts an uploaded PDF into a JSON document holding its text and metadata.
#
# Text extraction and metadata lookup are delegated to the Docsplit gem, and
# the external `pdffonts` program is used to choose between direct text
# extraction and OCR. Temporary files are written to (and removed from) the
# current working directory.
class UploadConvert
  # Lines consisting solely of one of these values are treated as margin line
  # numbers (as found on the edges of legal documents) and are dropped.
  MARGIN_LINE_NUMBERS = (1..28).map { |n| n.to_s }.freeze

  # `pdffonts` output must have more lines than this for the PDF to be treated
  # as containing embedded text.
  # NOTE(review): pdffonts normally prints a two-line header followed by one
  # line per font, so this presumably means "more than two fonts" - confirm.
  FONT_LISTING_THRESHOLD = 4

  # Directory Docsplit is asked to write OCR text into.
  OCR_TEXT_DIR = 'text'

  # input - path of the PDF to convert.
  def initialize(input)
    @input = input
    @output = ""
    @text = ""
  end

  # Convert PDFs to JSON
  #
  # Extracts the text and metadata, stores the pretty-printed JSON in @output
  # and returns it. The JSON object holds every metadata field followed by
  # :text and :input.
  def pdfTojson
    # Extract and clean text
    @text = detectPDFType

    # Extract metadata and generate output
    extractMetadataPDF
    outhash = @metadata.merge(:text => @text, :input => @input)
    @output = JSON.pretty_generate(outhash)
  end

  # Use embedded fonts to detect the type of PDF
  #
  # Returns the cleaned text, obtained via embedPDF when pdffonts lists enough
  # lines and via ocrPDF otherwise.
  def detectPDFType
    # Pass the path as a separate argv entry so it never goes through a shell:
    # spaces or shell metacharacters in the file name cannot break or hijack
    # the command.
    listing, _status = Open3.capture2("pdffonts", @input.to_s)

    if listing.split("\n").length > FONT_LISTING_THRESHOLD
      embedPDF
    else
      ocrPDF
    end
  end

  # Extract text from embedded text PDFs
  #
  # Returns the cleaned text; the intermediate .txt file is removed.
  def embedPDF
    Docsplit.extract_text(@input, :ocr => false)

    # Clean up text and delete file
    cleanPDF(slurpAndRemove(outputBasename + ".txt"))
  end

  # OCR PDFs and turn that text into a JSON
  #
  # Returns the cleaned OCR text. Only the page images created during this
  # call are deleted afterwards; PNG files that were already present in the
  # working directory are left alone.
  def ocrPDF
    preexisting = Dir["*.png"]
    raw = nil

    begin
      # Extract individual pages
      Docsplit.extract_images(@input)

      # OCR
      Docsplit.extract_text(@input, :ocr => true, :output => OCR_TEXT_DIR)
      raw = slurpAndRemove(File.join(OCR_TEXT_DIR, outputBasename + ".txt"))
    ensure
      # Clean up files even when extraction failed part-way through.
      (Dir["*.png"] - preexisting).each { |png| File.delete(png) }
      removeDirIfEmpty(OCR_TEXT_DIR)
    end

    cleanPDF(raw)
  end

  # Removes numbers from edges of legal documents
  #
  # text - raw extracted text; it is not modified.
  #
  # Normalizes CRLF line endings to LF, drops every line that consists solely
  # of a number from 1 to 28, stores the result in @text and returns it. Each
  # call starts from scratch, so repeated calls do not accumulate text.
  def cleanPDF(text)
    normalized = text.gsub(/\r?\n/, "\n")
    kept = normalized.each_line.reject do |line|
      # chomp so a margin number on the final, unterminated line is caught too
      MARGIN_LINE_NUMBERS.include?(line.chomp("\n"))
    end
    @text = kept.join
  end

  # Extract PDF metadata
  #
  # Fills @metadata with the author, creator, producer, title, subject, date,
  # keywords and length reported by Docsplit. Returns the length value, which
  # is what this method has always returned.
  def extractMetadataPDF
    @metadata = {
      :author   => Docsplit.extract_author(@input),
      :creator  => Docsplit.extract_creator(@input),
      :producer => Docsplit.extract_producer(@input),
      :title    => Docsplit.extract_title(@input),
      :subject  => Docsplit.extract_subject(@input),
      :date     => Docsplit.extract_date(@input),
      :keywords => Docsplit.extract_keywords(@input),
      :length   => Docsplit.extract_length(@input)
    }
    @metadata[:length]
  end

  private

  # File name of the input without directory or extension, e.g.
  # "docs/my.file.pdf" -> "my.file".
  # NOTE(review): Docsplit appears to name its output files this way inside
  # the output directory - confirm against the Docsplit version in use.
  def outputBasename
    path = @input.to_s
    File.basename(path, File.extname(path))
  end

  # Reads the whole file at path and deletes it, even if reading fails.
  def slurpAndRemove(path)
    File.read(path)
  ensure
    File.delete(path) if File.exist?(path)
  end

  # Deletes dir only when it exists and holds no entries, so a directory the
  # user already had content in is never touched and never raises.
  def removeDirIfEmpty(dir)
    return unless File.directory?(dir)
    Dir.delete(dir) if (Dir.entries(dir) - %w[. ..]).empty?
  end
end
|
100
|
+
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uploadconvert
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- M. C. McGrath
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-03-12 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Converts documents to the appropriate format for Transparency Toolkit.
|
15
|
+
email: shidash@shidash.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/uploadconvert.rb
|
21
|
+
homepage: https://github.com/Shidash/UploadConvert
|
22
|
+
licenses:
|
23
|
+
- GPL
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 1.8.23
|
43
|
+
signing_key:
|
44
|
+
specification_version: 3
|
45
|
+
summary: Collected conversion tools for Transparency Toolkit
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|