uploadconvert 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/uploadconvert.rb +100 -0
  2. metadata +47 -0
@@ -0,0 +1,100 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+ require 'open3'
4
+
5
# Converts a PDF into a JSON document containing its text and metadata.
#
# The text comes from Docsplit's plain text extraction when `pdffonts`
# lists enough fonts, and from Docsplit's OCR mode otherwise. Intermediate
# files are written relative to the current working directory and removed
# again once they have been read.
class UploadConvert
  # Directory (relative to the working directory) handed to Docsplit as the
  # output location for OCR text.
  OCR_OUTPUT_DIR = "text"

  # A line consisting solely of one of these numbers is treated as a margin
  # line number and dropped by #cleanPDF.
  MARGIN_LINE_NUMBERS = (1..28).map { |n| n.to_s }.freeze

  # @param input [String] path to the PDF file to convert
  def initialize(input)
    @input = input
    @output = ""
    @text = ""
  end

  # Convert the PDF to JSON.
  #
  # The JSON object holds every metadata field gathered by
  # #extractMetadataPDF plus :text (the cleaned text) and :input (the path
  # given to the constructor).
  #
  # @return [String] pretty-printed JSON
  def pdfTojson
    # Extract and clean text
    @text = detectPDFType

    # Extract metadata and generate output
    extractMetadataPDF
    outhash = @metadata.merge(:text => @text, :input => @input)
    @output = JSON.pretty_generate(outhash)
  end

  # Use the `pdffonts` listing to pick an extraction strategy.
  #
  # The command is run with an argument vector rather than through a shell,
  # so paths containing spaces or shell metacharacters are passed verbatim.
  #
  # @return [String] cleaned text from #embedPDF or #ocrPDF
  def detectPDFType
    listing, _status = Open3.capture2("pdffonts", @input)

    # More than four output lines selects the embedded-text path.
    # NOTE(review): pdffonts normally prints two header lines, so this
    # presumably means "at least three fonts listed" - confirm the threshold.
    if listing.split("\n").length > 4
      embedPDF
    else
      ocrPDF
    end
  end

  # Extract text from PDFs that carry embedded text.
  #
  # @return [String] cleaned text
  def embedPDF
    Docsplit.extract_text(@input, :ocr => false)

    # No :output option is given, so the text file is expected in the
    # working directory, named after the PDF's base name.
    cleanPDF(consume_text_file("#{pdf_basename}.txt"))
  end

  # OCR the PDF and return the recognised text.
  #
  # Only PNG files that appear during this call are deleted; images that
  # were already in the working directory are left alone. The OCR output
  # directory is removed only when this call created it and it ends up empty.
  #
  # @return [String] cleaned text
  def ocrPDF
    preexisting_images = Dir["*.png"]
    text_dir_existed = File.directory?(OCR_OUTPUT_DIR)

    begin
      # Extract individual pages
      # NOTE(review): nothing visible in this class reads these images;
      # they are only cleaned up afterwards - confirm they are still needed.
      Docsplit.extract_images(@input)

      # OCR
      Docsplit.extract_text(@input, :ocr => true, :output => OCR_OUTPUT_DIR)
      raw = consume_text_file(File.join(OCR_OUTPUT_DIR, "#{pdf_basename}.txt"))
    ensure
      # Clean up generated files even when extraction fails part-way.
      (Dir["*.png"] - preexisting_images).each { |png| File.delete(png) }
      remove_dir_if_empty(OCR_OUTPUT_DIR) unless text_dir_existed
    end

    cleanPDF(raw)
  end

  # Removes numbers from edges of legal documents.
  #
  # Line endings are normalised to "\n", then every line whose whole content
  # is one of the numbers 1..28 is dropped (including a final line with no
  # trailing newline). The argument is not modified, and the result replaces
  # any previously stored text rather than being appended to it.
  #
  # @param text [String] raw extracted text
  # @return [String] the cleaned text (also stored in @text)
  def cleanPDF(text)
    normalized = text.gsub(/\r?\n/, "\n")
    kept = normalized.each_line.reject do |line|
      MARGIN_LINE_NUMBERS.include?(line.chomp)
    end
    @text = kept.join
  end

  # Extract PDF metadata into @metadata via Docsplit's extract_* helpers.
  def extractMetadataPDF
    @metadata = {}
    @metadata[:author] = Docsplit.extract_author(@input)
    @metadata[:creator] = Docsplit.extract_creator(@input)
    @metadata[:producer] = Docsplit.extract_producer(@input)
    @metadata[:title] = Docsplit.extract_title(@input)
    @metadata[:subject] = Docsplit.extract_subject(@input)
    @metadata[:date] = Docsplit.extract_date(@input)
    @metadata[:keywords] = Docsplit.extract_keywords(@input)
    @metadata[:length] = Docsplit.extract_length(@input)
  end

  private

  # File name of the input without directory or extension, e.g.
  # "dir.v2/report.final.pdf" -> "report.final".
  def pdf_basename
    File.basename(@input, File.extname(@input))
  end

  # Reads the file at +path+ and deletes it afterwards, even if reading fails.
  def consume_text_file(path)
    File.read(path)
  ensure
    File.delete(path) if File.exist?(path)
  end

  # Deletes +dir+ when it exists and holds no entries.
  def remove_dir_if_empty(dir)
    return unless File.directory?(dir)

    Dir.delete(dir) if (Dir.entries(dir) - %w[. ..]).empty?
  end
end
100
+
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uploadconvert
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-12 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Converts documents to the appropriate format for Transparency Toolkit.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/uploadconvert.rb
21
+ homepage: https://github.com/Shidash/UploadConvert
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Collected conversion tools for Transparency Toolkit
46
+ test_files: []
47
+ has_rdoc: