uploadconvert 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/uploadconvert.rb +100 -0
  2. metadata +47 -0
@@ -0,0 +1,100 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+ require 'open3'
4
+
5
# Converts an uploaded PDF into a JSON string containing its metadata,
# its extracted text and the input path.
#
# Text and metadata extraction are delegated to Docsplit; the `pdffonts`
# command-line tool is used to decide whether the PDF has to be OCRed.
class UploadConvert
  # A line that consists solely of one of these numbers (followed by a
  # newline) is treated as margin numbering of a legal document and dropped.
  MARGIN_LINES = (1..28).map { |n| "#{n}\n" }.freeze

  # Directory handed to Docsplit as the :output option when OCRing.
  OCR_OUTPUT_DIR = "text"

  # input - path of the PDF to convert.
  def initialize(input)
    @input = input
    @output = ""
    @text = ""
  end

  # Convert PDFs to JSON.
  #
  # Returns the pretty-printed JSON string (also stored in @output). The
  # JSON object holds every metadata field plus :text and :input.
  def pdfTojson
    # Extract and clean text
    @text = detectPDFType

    # Extract metadata and generate output
    extractMetadataPDF
    outhash = @metadata.merge(:text => @text, :input => @input)
    @output = JSON.pretty_generate(outhash)
  end

  # Use embedded fonts to detect the type of PDF and extract its text with
  # the matching strategy.
  #
  # More than four lines of `pdffonts` output is taken to mean the PDF
  # carries embedded text; otherwise the PDF is OCRed.
  # NOTE(review): presumably the first two output lines are a table header,
  # which would make the threshold "at least three fonts" - confirm.
  #
  # Returns the cleaned text.
  def detectPDFType
    # Argument-vector form: no shell is involved, so paths containing
    # spaces or shell metacharacters are passed through verbatim.
    listing, _status = Open3.capture2("pdffonts", @input)
    listing.split("\n").length > 4 ? embedPDF : ocrPDF
  end

  # Extract text from embedded text PDFs.
  #
  # Returns the cleaned text. The intermediate .txt file is removed even
  # when reading it fails.
  def embedPDF
    # NOTE(review): relies on Docsplit writing "<basename>.txt" into the
    # :output directory - confirm against the Docsplit version in use.
    txt_path = File.join(".", "#{text_basename}.txt")
    begin
      Docsplit.extract_text(@input, :ocr => false, :output => ".")
      text = File.read(txt_path)
    ensure
      File.delete(txt_path) if File.exist?(txt_path)
    end
    cleanPDF(text)
  end

  # OCR PDFs and return the cleaned text.
  #
  # Page images and the intermediate text file are removed afterwards, also
  # when extraction raises. Only PNG files that did not exist before the
  # call are deleted, and the output directory is removed only when empty.
  def ocrPDF
    preexisting_pngs = Dir["*.png"]
    txt_path = File.join(OCR_OUTPUT_DIR, "#{text_basename}.txt")
    begin
      # Extract individual pages
      Docsplit.extract_images(@input)

      # OCR
      Docsplit.extract_text(@input, :ocr => true, :output => OCR_OUTPUT_DIR)
      text = File.read(txt_path)
    ensure
      # Clean up text and files
      File.delete(txt_path) if File.exist?(txt_path)
      remove_dir_if_empty(OCR_OUTPUT_DIR)
      (Dir["*.png"] - preexisting_pngs).each { |png| File.delete(png) }
    end
    cleanPDF(text)
  end

  # Removes numbers from edges of legal documents.
  #
  # text - the raw extracted text; it is not modified.
  #
  # Line endings are normalised from CRLF to LF, then every line that is
  # exactly a number from 1 to 28 followed by a newline is dropped. A final
  # line without a trailing newline is always kept.
  #
  # Returns the cleaned text, which also replaces @text (earlier results
  # are not accumulated).
  def cleanPDF(text)
    normalized = text.gsub(/\r?\n/, "\n")
    @text = normalized.each_line.reject { |line| MARGIN_LINES.include?(line) }.join
  end

  # Extract PDF metadata into @metadata, keyed by symbol.
  def extractMetadataPDF
    @metadata = {}
    @metadata[:author] = Docsplit.extract_author(@input)
    @metadata[:creator] = Docsplit.extract_creator(@input)
    @metadata[:producer] = Docsplit.extract_producer(@input)
    @metadata[:title] = Docsplit.extract_title(@input)
    @metadata[:subject] = Docsplit.extract_subject(@input)
    @metadata[:date] = Docsplit.extract_date(@input)
    @metadata[:keywords] = Docsplit.extract_keywords(@input)
    @metadata[:length] = Docsplit.extract_length(@input)
  end

  private

  # File name of the input without directory and without its last
  # extension, e.g. "docs/my.report.pdf" -> "my.report".
  def text_basename
    File.basename(@input, File.extname(@input))
  end

  # Removes dir when it exists and holds no entries; leaves it alone
  # otherwise so unrelated files are never lost.
  def remove_dir_if_empty(dir)
    return unless File.directory?(dir)
    Dir.rmdir(dir) if (Dir.entries(dir) - %w[. ..]).empty?
  end
end
100
+
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uploadconvert
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-12 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Converts documents to the appropriate format for Transparency Toolkit.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/uploadconvert.rb
21
+ homepage: https://github.com/Shidash/UploadConvert
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Collected conversion tools for Transparency Toolkit
46
+ test_files: []
47
+ has_rdoc: