parsefile 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
4
+ data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
5
+ SHA512:
6
+ metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
7
+ data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0
@@ -0,0 +1,56 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+
4
# Collects path-derived fields (relative path, display name, file type) and
# document metadata (via Docsplit) for a single file.
class ExtractMetadata
  # file       - path of the file to inspect
  # input_dir  - directory prefix that is stripped from +file+ to obtain the
  #              relative path
  # output_dir - stored on the instance; not read by any method in this class
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Extract metadata.
  #
  # Returns a Hash with :rel_path, :formatted_name and :filetype followed by
  # the document fields produced by #extract_file_metadata.
  def extract
    # The relative path is cached because the name/type helpers derive from it.
    @rel_path = get_rel_path

    outhash = {
      :rel_path => @rel_path,
      :formatted_name => get_formatted_name,
      :filetype => get_file_type
    }

    # Extract file metadata, merge, and return
    outhash.merge!(extract_file_metadata)
  end

  # Get the path relative to the input directory.
  #
  # Only a leading occurrence of the input directory is removed, so a
  # directory name that repeats deeper in the path is left intact. When the
  # path does not start with the input directory it is returned unchanged.
  def get_rel_path
    prefix = @input_dir.to_s
    return @path if prefix.empty? || !@path.start_with?(prefix)

    @path[prefix.length..-1]
  end

  # Get a formatted file name: the relative path without its final extension,
  # with underscores turned into spaces and path separators removed.
  #
  # Only the extension of the last path component is dropped, so dots inside
  # directory names do not truncate the result.
  def get_formatted_name
    rel = current_rel_path
    ext = File.extname(rel)
    stem = ext.empty? ? rel : rel[0...-ext.length]
    stem.tr("_", " ").delete("/")
  end

  # Get file type: the extension of the last path component without the
  # leading dot, or an empty string when the file has no extension.
  def get_file_type
    File.extname(current_rel_path).sub(/\A\./, "")
  end

  # Extract document metadata by asking Docsplit for each field in turn.
  #
  # Returns a Hash keyed by :author, :creator, :producer, :title, :subject,
  # :date, :keywords and :length.
  def extract_file_metadata
    {
      :author => Docsplit.extract_author(@path),
      :creator => Docsplit.extract_creator(@path),
      :producer => Docsplit.extract_producer(@path),
      :title => Docsplit.extract_title(@path),
      :subject => Docsplit.extract_subject(@path),
      :date => Docsplit.extract_date(@path),
      :keywords => Docsplit.extract_keywords(@path),
      :length => Docsplit.extract_length(@path)
    }
  end

  private

  # Relative path cached by #extract, computed on demand when a helper is
  # called on its own.
  def current_rel_path
    @rel_path || get_rel_path
  end
end
data/lib/ocrfile.rb ADDED
@@ -0,0 +1,77 @@
1
+ require 'fileutils'
2
+ require 'docsplit'
3
+ require 'curb'
4
+
5
# Obtains the text of a single file: reuses previously extracted text when it
# exists, runs Docsplit on PDFs, and posts any other file to the remote
# "give me text" service.
class OCRFile
  # Raised when the remote extraction service answers with an error message
  # instead of document text.
  class ExtractionError < StandardError; end

  # file       - path of the file to process
  # input_dir  - stored on the instance; not read by any method in this class
  # output_dir - directory that may already hold extracted text
  # rel_path   - path appended to +output_dir+ to locate previously extracted text
  def initialize(file, input_dir, output_dir, rel_path)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
    @rel_path = rel_path
    @text = ""
  end

  # OCR file.
  #
  # Returns the extracted text. Failures are reported on stderr and whatever
  # text is available (an empty string by default) is returned, so a single
  # bad file does not abort the caller.
  def ocr
    extracted = @output_dir + @rel_path
    if File.exist?(extracted)
      load_extracted_text(extracted)
    elsif File.extname(@path).downcase == ".pdf"
      # Decide by the real extension rather than ".pdf" anywhere in the path.
      ocr_pdf
    else
      give_me_text
    end
    @text
  rescue StandardError => e
    warn "OCRFile: could not extract text from #{@path}: #{e.class}: #{e.message}"
    @text
  end

  # Check if file is pdf by looking for a "%PDF-<version>" marker in its
  # first 8 bytes. Returns true or false; an empty file is not a PDF.
  def is_pdf?
    file_start = File.open(@path, 'rb') { |f| f.read(8) }
    return false if file_start.nil?

    !(file_start =~ /\%PDF-\d+\.?\d+/).nil?
  end

  # Load text that is already extracted
  def load_extracted_text(file)
    @text = File.read(file)
  end

  # Send file to give me text and keep the response body as the text.
  #
  # Raises ExtractionError when the service reports a failure; in that case
  # the previously held text is left untouched.
  def give_me_text
    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
    c.multipart_form_post = true
    c.http_post(Curl::PostField.file('file', @path))
    body = c.body_str.to_s
    gotten_text_ok?(body)
    @text = body
  end

  # Checks if text was successfully extracted.
  #
  # Returns true when +text+ does not carry the service's error marker and
  # raises ExtractionError otherwise.
  def gotten_text_ok?(text)
    if text.include?("java.io.IOException: Stream Closed")
      raise ExtractionError, "text extraction service reported: java.io.IOException: Stream Closed"
    end

    true
  end

  # OCR a PDF with Docsplit.
  #
  # Uses the scratch directories "pages" and "text" under the current working
  # directory and removes them afterwards, even when extraction fails.
  def ocr_pdf
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => 'pages')
    filename = File.basename(@path, File.extname(@path))
    docs = scratch_files('pages', filename)

    # Extract text and join the per-page output in page order
    Docsplit.extract_text(docs, :output => 'text')
    @text = scratch_files('text', filename).map { |f| File.read(f) }.join
  ensure
    # Clean up
    remove_scratch_dir('pages')
    remove_scratch_dir('text')
  end

  private

  # Files in +dir+ whose name starts with +stem+, ordered by the number that
  # follows the last underscore. Names are compared literally, so characters
  # that are special in glob patterns cannot cause mismatches.
  def scratch_files(dir, stem)
    return [] unless Dir.exist?(dir)

    names = Dir.entries(dir).select { |entry| entry.start_with?(stem) && File.file?(File.join(dir, entry)) }
    names.sort_by { |entry| entry[/_(\d+)\.[^.]*\z/, 1].to_i }
         .map { |entry| File.join(dir, entry) }
  end

  # Delete the files directly inside +dir+ and then the directory itself,
  # provided nothing else is left in it.
  def remove_scratch_dir(dir)
    return unless Dir.exist?(dir)

    FileUtils.rm_f(Dir.glob("#{dir}/*"))
    Dir.delete(dir) if (Dir.entries(dir) - %w[. ..]).empty?
  end
end
data/lib/parsefile.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'json'
2
+ require 'docsplit'
3
+ require 'fileutils'
4
+ require 'pry'
5
+ require 'dircrawl'
6
+ load 'ocrfile.rb'
7
+ load 'extractmetadata.rb'
8
+
9
# Combines the metadata and the extracted text of one file into a JSON
# document.
class ParseFile
  # file       - path of the file to parse
  # input_dir  - passed through to ExtractMetadata and OCRFile
  # output_dir - passed through to ExtractMetadata and OCRFile
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Parse the file: gather metadata, obtain the text, and return the JSON
  # produced by #gen_output.
  def parse_file
    # Get metadata
    @metadata = ExtractMetadata.new(@path, @input_dir, @output_dir).extract

    # OCR File
    @text = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path]).ocr

    # Generate output and return
    gen_output
  end

  # Generate output.
  #
  # Returns a pretty-printed JSON String holding :full_path, every metadata
  # field, and :text, in that order. Works before #parse_file has run, in
  # which case there are no metadata fields and the text is null.
  def gen_output
    outhash = { :full_path => @path }
    outhash.merge!(@metadata || {})
    outhash[:text] = @text

    # Extracted text can contain bytes that are not valid UTF-8, which the
    # JSON generator rejects; clean string values before serialising.
    outhash.each_key { |key| outhash[key] = json_safe(outhash[key]) }

    JSON.pretty_generate(outhash)
  end

  private

  # Returns +value+ unchanged unless it is a String that cannot be encoded as
  # JSON as-is; such strings are read as UTF-8 with invalid bytes replaced.
  def json_safe(value)
    return value unless value.is_a?(String)

    str = value.encoding == Encoding::ASCII_8BIT ? value.dup.force_encoding(Encoding::UTF_8) : value
    str.valid_encoding? ? str : str.scrub
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parsefile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-28 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: OCR file and extract metadata
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/extractmetadata.rb
20
+ - lib/ocrfile.rb
21
+ - lib/parsefile.rb
22
+ homepage: https://github.com/TransparencyToolkit/parsefile
23
+ licenses:
24
+ - GPL
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.8
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: OCR file and extract metadata
46
+ test_files: []
47
+ has_rdoc: