parsefile 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
4
+ data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
5
+ SHA512:
6
+ metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
7
+ data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0
@@ -0,0 +1,56 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+
4
# Collects descriptive metadata for one document: its path relative to the
# input directory, a display name, its file type, and the document
# properties reported by Docsplit.
class ExtractMetadata
  # file       - path of the document to inspect
  # input_dir  - directory prefix stripped from +file+ to build the relative path
  # output_dir - stored for parity with the other classes; not read here
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Extract metadata
  #
  # Returns a Hash with :rel_path, :formatted_name and :filetype followed by
  # the keys produced by #extract_file_metadata.
  def extract
    # Get relative path (cached because the name/type helpers read it)
    @rel_path = get_rel_path

    # Path-derived fields first, then the file metadata merged on top
    {
      :rel_path => @rel_path,
      :formatted_name => get_formatted_name,
      :filetype => get_file_type
    }.merge(extract_file_metadata)
  end

  # Get the relative path
  #
  # Only a leading +input_dir+ is removed. Occurrences of the same string
  # deeper in the path are part of the document's location and are kept.
  # A path that does not start with +input_dir+ is returned unchanged.
  def get_rel_path
    return @path unless @path.start_with?(@input_dir)

    @path[@input_dir.length..-1]
  end

  # Get a formatted file name
  #
  # The basename is cut at its first "." (so "a.tar.gz" yields "a"); dots in
  # directory names are left alone. Underscores then become spaces and all
  # slashes are dropped.
  def get_formatted_name
    dir, _slash, base = rel_path.rpartition("/")
    # to_s covers an empty basename, where split returns an empty array
    stem = base.split(".").first.to_s
    "#{dir}#{stem}".tr("_", " ").delete("/")
  end

  # Get file type
  #
  # Returns the final extension without its dot, or "" when the file name
  # has no extension (dots in directory names are not considered).
  def get_file_type
    File.extname(rel_path).sub(/\A\./, "")
  end

  # Extract PDF metadata
  #
  # Returns a Hash holding whatever each Docsplit extractor returns for @path.
  def extract_file_metadata
    {
      :author => Docsplit.extract_author(@path),
      :creator => Docsplit.extract_creator(@path),
      :producer => Docsplit.extract_producer(@path),
      :title => Docsplit.extract_title(@path),
      :subject => Docsplit.extract_subject(@path),
      :date => Docsplit.extract_date(@path),
      :keywords => Docsplit.extract_keywords(@path),
      :length => Docsplit.extract_length(@path)
    }
  end

  private

  # Relative path, computed on demand so the helpers also work before #extract
  def rel_path
    @rel_path ||= get_rel_path
  end
end
data/lib/ocrfile.rb ADDED
@@ -0,0 +1,77 @@
1
+ require 'fileutils'
2
+ require 'docsplit'
3
+ require 'curb'
4
+
5
# Produces the plain text of one document, using (in order of preference)
# previously extracted text, Docsplit for PDFs, or the remote "give me text"
# Tika service for everything else.
class OCRFile
  # Scratch directories (relative to the working directory) used by #ocr_pdf
  PAGES_DIR = 'pages'
  TEXT_DIR = 'text'

  # file       - path of the document to read
  # input_dir  - stored; not read by this class
  # output_dir - directory checked for already-extracted text
  # rel_path   - path of the document relative to the input directory;
  #              appended to +output_dir+ to locate already-extracted text
  def initialize(file, input_dir, output_dir, rel_path)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
    @rel_path = rel_path
    @text = ""
  end

  # OCR file
  #
  # Returns the extracted text. Extraction is best-effort: any StandardError
  # is reported on stderr and whatever text was gathered so far (possibly "")
  # is returned, so one bad document does not abort the caller.
  def ocr
    begin
      if File.exist?(@output_dir+@rel_path)
        load_extracted_text(@output_dir+@rel_path)
      elsif pdf_extension?
        ocr_pdf
      else
        give_me_text
      end
    rescue StandardError => e # Detect errors
      # An uncaught `throw :extraction_error` surfaces here as UncaughtThrowError.
      warn "OCRFile: could not extract text from #{@path}: #{e.class}: #{e.message}"
    end

    @text
  end

  # Check if file is pdf
  #
  # Reads the first 8 bytes of the file and matches them against the PDF
  # header pattern. Returns the MatchData (truthy) or nil; an empty file
  # yields nil instead of raising.
  def is_pdf?
    file_start = File.open(@path, 'rb') { |f| f.read(8) }
    # read returns nil at EOF, so coerce before matching
    file_start.to_s.match(/\%PDF-\d+\.?\d+/)
  end

  # Load text that is already extracted
  def load_extracted_text(file)
    @text = File.read(file)
  end

  # Send file to give me text
  #
  # Posts the file as a multipart form and stores the response body as the
  # text. The body is validated first so a failed extraction does not leave
  # the service's error output in @text.
  def give_me_text
    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
    c.multipart_form_post = true
    c.http_post(Curl::PostField.file('file', @path))
    body = c.body_str
    gotten_text_ok?(body)
    @text = body
  end

  # Checks if text was successfully extracted
  #
  # Throws :extraction_error when the response contains the service's
  # "Stream Closed" failure marker; otherwise returns nil.
  def gotten_text_ok?(text)
    throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
  end

  # OCR with tesseract
  #
  # Splits the PDF into pages, extracts text per page via Docsplit, and joins
  # the page texts in page-number order. The scratch directories are removed
  # even when extraction fails.
  def ocr_pdf
    # Assumes Docsplit's output files start with the PDF's basename, as the
    # prefix match below relies on (same assumption as the original glob).
    filename = File.basename(@path, File.extname(@path))

    begin
      # Split pages to handle large PDFs
      Docsplit.extract_pages(@path, :output => PAGES_DIR)
      docs = scratch_files(PAGES_DIR, filename)

      # Extract text and save
      Docsplit.extract_text(docs, :output => TEXT_DIR)
      sorted_text = scratch_files(TEXT_DIR, filename).sort_by { |f| page_number(f) }
      @text = sorted_text.map { |f| File.read(f) }.join
    ensure
      # Clean up
      FileUtils.rm_rf([PAGES_DIR, TEXT_DIR])
    end
  end

  private

  # True when the path's extension is .pdf, in any letter case
  def pdf_extension?
    File.extname(@path).casecmp(".pdf").zero?
  end

  # Files in +dir+ whose name starts with +prefix+; filtering in Ruby keeps
  # glob metacharacters in the prefix from being interpreted as patterns
  def scratch_files(dir, prefix)
    Dir.glob(File.join(dir, "*")).select { |f| File.basename(f).start_with?(prefix) }
  end

  # Trailing "_<n>.txt" page number of a text file, or 0 when absent
  def page_number(file)
    file[/_(\d+)\.txt\z/, 1].to_i
  end
end
data/lib/parsefile.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+ require 'fileutils'
4
+ require 'pry'
5
+ require 'dircrawl'
6
+ load 'ocrfile.rb'
7
+ load 'extractmetadata.rb'
8
+
9
# Ties metadata extraction and text extraction together for a single file
# and renders the combined result as JSON.
class ParseFile
  # file       - path of the document to parse
  # input_dir  - passed through to ExtractMetadata and OCRFile
  # output_dir - passed through to ExtractMetadata and OCRFile
  def initialize(file, input_dir, output_dir)
    @path, @input_dir, @output_dir = file, input_dir, output_dir
  end

  # Parse the file
  #
  # Runs metadata extraction, then text extraction (which needs the relative
  # path from the metadata), and returns the JSON produced by #gen_output.
  def parse_file
    @metadata = ExtractMetadata.new(@path, @input_dir, @output_dir).extract
    @text = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path]).ocr
    gen_output
  end

  # Generate output
  #
  # Returns a pretty-printed JSON string: the full path first, then every
  # metadata entry, then the extracted text under :text.
  def gen_output
    combined = { :full_path => @path }.merge(@metadata).merge(:text => @text)
    JSON.pretty_generate(combined)
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parsefile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-28 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: OCR file and extract metadata
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/extractmetadata.rb
20
+ - lib/ocrfile.rb
21
+ - lib/parsefile.rb
22
+ homepage: https://github.com/TransparencyToolkit/parsefile
23
+ licenses:
24
+ - GPL
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.8
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: OCR file and extract metadata
46
+ test_files: []
47
+ has_rdoc: