parsefile 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/extractmetadata.rb +56 -0
- data/lib/ocrfile.rb +77 -0
- data/lib/parsefile.rb +38 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
|
4
|
+
data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
|
7
|
+
data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'docsplit'
|
3
|
+
|
4
|
+
# Builds a metadata hash for one file: values derived from its path
# (relative path, display name, extension) plus the document properties
# reported by Docsplit.
class ExtractMetadata
  # file       - path of the file to describe
  # input_dir  - directory prefix that is stripped from +file+ to obtain
  #              the relative path
  # output_dir - stored for symmetry with the other classes; not read by
  #              any method in this class
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Extract metadata
  #
  # Returns a Hash with :rel_path, :formatted_name, :filetype and the keys
  # produced by #extract_file_metadata, in that order.
  def extract
    outhash = {}

    # Get relative path
    @rel_path = get_rel_path
    outhash[:rel_path] = @rel_path

    # Get formatted name and file type
    outhash[:formatted_name] = get_formatted_name
    outhash[:filetype] = get_file_type

    # Extract file metadata, merge, and return
    outhash.merge!(extract_file_metadata)
    outhash
  end

  # Get the relative path
  #
  # Only a leading +input_dir+ is removed. Later occurrences of the same
  # text inside the path (e.g. a nested directory with the same name) are
  # kept, so the result still identifies the file.
  def get_rel_path
    prefix = @input_dir.to_s
    skip = !prefix.empty? && @path.start_with?(prefix) ? prefix.length : 0
    @path[skip..-1]
  end

  # Get a formatted file name
  #
  # Drops only the final extension (so "my_file.v2.pdf" keeps ".v2"),
  # turns underscores into spaces and removes "/" separators.
  def get_formatted_name
    path = rel_path
    ext = File.extname(path)
    stem = ext.empty? ? path : path[0...-ext.length]
    stem.gsub("_", " ").gsub("/", "")
  end

  # Get file type
  #
  # Returns the final extension without its dot, or "" when the file name
  # has no extension.
  def get_file_type
    File.extname(rel_path).delete(".")
  end

  # Extract PDF metadata
  #
  # Queries Docsplit once per field and returns the results keyed by field
  # name. Errors raised by Docsplit propagate to the caller.
  def extract_file_metadata
    {
      author: Docsplit.extract_author(@path),
      creator: Docsplit.extract_creator(@path),
      producer: Docsplit.extract_producer(@path),
      title: Docsplit.extract_title(@path),
      subject: Docsplit.extract_subject(@path),
      date: Docsplit.extract_date(@path),
      keywords: Docsplit.extract_keywords(@path),
      length: Docsplit.extract_length(@path)
    }
  end

  private

  # Relative path, computed on first use so the name/type helpers also work
  # when they are called without a prior #extract.
  def rel_path
    @rel_path ||= get_rel_path
  end
end
|
data/lib/ocrfile.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'docsplit'
|
3
|
+
require 'curb'
|
4
|
+
|
5
|
+
# Obtains the plain text of one file, either from a previously written
# output file, by splitting and text-extracting a PDF with Docsplit, or by
# posting the file to the givemetext (Tika) web service.
class OCRFile
  # Raised when the text service answers with an error instead of text.
  class ExtractionError < StandardError; end

  # file       - path of the file to read
  # input_dir  - stored; not read by any method in this class
  # output_dir - directory checked for an already extracted copy
  # rel_path   - path appended to +output_dir+ to locate that copy
  def initialize(file, input_dir, output_dir, rel_path)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
    @rel_path = rel_path
    @text = ""
  end

  # OCR file
  #
  # Best effort: any StandardError raised while extracting is reported on
  # stderr and whatever text was gathered so far (possibly "") is returned.
  # The previous version opened a pry debugging session here, which halts a
  # non-interactive run and needs pry to be loaded by some other file.
  def ocr
    begin
      extracted = @output_dir + @rel_path
      if File.exist?(extracted)
        load_extracted_text(extracted)
      elsif pdf_extension?
        ocr_pdf
      else
        give_me_text
      end
    rescue StandardError => e
      warn "OCRFile: could not extract text from #{@path}: #{e.class}: #{e.message}"
    end

    @text
  end

  # Check if file is pdf
  #
  # Looks for a "%PDF-<version>" marker in the first 8 bytes. Returns
  # true/false; an empty file yields false instead of raising.
  def is_pdf?
    file_start = File.open(@path, 'rb') { |f| f.read(8) }
    return false if file_start.nil?

    !file_start.match(/\%PDF-\d+\.?\d+/).nil?
  end

  # Load text that is already extracted
  def load_extracted_text(file)
    @text = File.read(file)
  end

  # Send file to give me text
  #
  # Posts the file as a multipart form and keeps the response body as the
  # text. The body is validated first, so a service error trace is never
  # stored as if it were the document's text.
  def give_me_text
    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
    c.multipart_form_post = true
    c.http_post(Curl::PostField.file('file', @path))
    body = c.body_str
    gotten_text_ok?(body)
    @text = body
  end

  # Checks if text was successfully extracted
  #
  # Returns true when +text+ does not contain the service's known failure
  # marker; raises ExtractionError otherwise. (The earlier `throw` had no
  # matching `catch`, so it surfaced as an UncaughtThrowError.)
  def gotten_text_ok?(text)
    if text.include?("java.io.IOException: Stream Closed")
      raise ExtractionError, "text service failed for #{@path}"
    end

    true
  end

  # OCR with tesseract
  #
  # Uses the working-directory folders "pages" and "text" as scratch space
  # and removes both afterwards, also when extraction fails part-way.
  def ocr_pdf
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => 'pages')
    # Base name without its extension, whatever the extension's case.
    # NOTE(review): assumes Docsplit names page files "<basename>_<n>.pdf".
    filename = File.basename(@path, File.extname(@path))
    docs = Dir["pages/#{filename}*"]

    # Extract text and save
    Docsplit.extract_text(docs, :output => 'text')
    text_files = Dir["text/#{filename}*"]
    # Order by the trailing page number so page 10 follows page 9, not 1.
    sorted_text = text_files.sort_by { |f| File.basename(f, ".txt")[/(\d+)\z/, 1].to_i }
    # Assign rather than append so a repeated call does not duplicate text.
    @text = sorted_text.map { |f| File.read(f) }.join
  ensure
    # Clean up
    FileUtils.rm_rf(%w[pages text])
  end

  private

  # True when the path's final extension is ".pdf" in any letter case.
  def pdf_extension?
    File.extname(@path).casecmp(".pdf").zero?
  end
end
|
data/lib/parsefile.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'docsplit'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'pry'
|
5
|
+
require 'dircrawl'
|
6
|
+
load 'ocrfile.rb'
|
7
|
+
load 'extractmetadata.rb'
|
8
|
+
|
9
|
+
# Combines metadata extraction and text extraction for one file and renders
# the result as a JSON document.
class ParseFile
  # file       - path of the file to parse
  # input_dir  - passed through to ExtractMetadata and OCRFile
  # output_dir - passed through to ExtractMetadata and OCRFile
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Parse the file
  #
  # Returns the JSON string produced by #gen_output.
  def parse_file
    # Get metadata
    m = ExtractMetadata.new(@path, @input_dir, @output_dir)
    @metadata = m.extract

    # OCR File
    o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path])
    @text = o.ocr

    # Generate output and return
    gen_output
  end

  # Generate output
  #
  # Returns pretty-printed JSON containing :full_path, every metadata key,
  # and :text. Works before #parse_file has run (no metadata, null text).
  def gen_output
    outhash = { full_path: @path }
    outhash.merge!(@metadata || {})
    outhash[:text] = utf8_text(@text)
    JSON.pretty_generate(outhash)
  end

  private

  # JSON generation rejects strings that are not valid UTF-8, which
  # extracted text can easily be (binary-tagged HTTP bodies, stray bytes in
  # OCR output). Re-tag binary strings as UTF-8 and replace any bytes that
  # still do not form valid characters. nil is passed through unchanged.
  def utf8_text(text)
    return text if text.nil?

    str = text.to_s.dup
    str.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::ASCII_8BIT
    if str.encoding == Encoding::UTF_8
      str.scrub
    else
      str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: parsefile
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-01-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: OCR file and extract metadata
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/extractmetadata.rb
|
20
|
+
- lib/ocrfile.rb
|
21
|
+
- lib/parsefile.rb
|
22
|
+
homepage: https://github.com/TransparencyToolkit/parsefile
|
23
|
+
licenses:
|
24
|
+
- GPL
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.8
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: OCR file and extract metadata
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|