parsefile 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/extractmetadata.rb +56 -0
- data/lib/ocrfile.rb +77 -0
- data/lib/parsefile.rb +38 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
|
4
|
+
data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
|
7
|
+
data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'docsplit'
|
3
|
+
|
4
|
+
# Collects descriptive metadata for a single input file: its path relative to
# the input directory, a display name, its extension, and the document
# properties reported by Docsplit.
class ExtractMetadata
  # file       - path of the file to describe
  # input_dir  - directory prefix stripped from file to form the relative path
  # output_dir - stored, but not read anywhere in this class
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Extract metadata
  #
  # Returns a hash with :rel_path, :formatted_name and :filetype followed by
  # the keys produced by extract_file_metadata.
  def extract
    # Recompute on every call so the result always reflects the current path.
    @rel_path = get_rel_path

    outhash = {
      rel_path: @rel_path,
      formatted_name: get_formatted_name,
      filetype: get_file_type
    }

    # Extract file metadata, merge, and return
    outhash.merge!(extract_file_metadata)
  end

  # Get the relative path
  #
  # Only a leading input_dir is removed. Later occurrences of the same string
  # inside the path are part of the real file location and are kept.
  def get_rel_path
    prefix = @input_dir.to_s
    return @path if prefix.empty? || !@path.start_with?(prefix)

    @path[prefix.length..-1]
  end

  # Get a formatted file name
  #
  # Drops the final extension, turns underscores into spaces and removes
  # slashes. Dots that are not part of the final extension are preserved.
  def get_formatted_name
    path = rel_path
    ext = File.extname(path)
    stem = ext.empty? ? path : path[0...-ext.length]
    stem.tr("_", " ").delete("/")
  end

  # Get file type
  #
  # Returns the final extension without its leading dot, or "" when the file
  # name has no extension.
  def get_file_type
    File.extname(rel_path).sub(/\A\./, "")
  end

  # Extract PDF metadata
  #
  # Returns a hash holding the value Docsplit reports for each property.
  def extract_file_metadata
    {
      author: Docsplit.extract_author(@path),
      creator: Docsplit.extract_creator(@path),
      producer: Docsplit.extract_producer(@path),
      title: Docsplit.extract_title(@path),
      subject: Docsplit.extract_subject(@path),
      date: Docsplit.extract_date(@path),
      keywords: Docsplit.extract_keywords(@path),
      length: Docsplit.extract_length(@path)
    }
  end

  private

  # Relative path, computed on first use so the name and type helpers also
  # work when called without a preceding extract.
  def rel_path
    @rel_path ||= get_rel_path
  end
end
|
data/lib/ocrfile.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'docsplit'
|
3
|
+
require 'curb'
|
4
|
+
|
5
|
+
# Obtains the plain text of one file by reusing previously extracted text,
# running Docsplit on PDFs, or posting the file to the Give Me Text service.
class OCRFile
  # file       - path of the file to read
  # input_dir  - stored, but not read anywhere in this class
  # output_dir - directory checked for previously extracted text
  # rel_path   - appended to output_dir to locate that text
  def initialize(file, input_dir, output_dir, rel_path)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
    @rel_path = rel_path
    @text = ""
  end

  # OCR file
  #
  # Returns the extracted text. Extraction failures are reported on stderr
  # and whatever text was gathered before the failure (possibly "") is
  # returned, so one bad file does not halt the caller.
  def ocr
    extract_text
    @text
  rescue StandardError => e
    warn "OCRFile: could not extract text from #{@path}: #{e.class}: #{e.message}"
    @text
  end

  # Check if file is pdf
  #
  # Looks for a "%PDF-<version>" marker in the first 8 bytes of the file.
  # Returns true or false; an empty file is not a PDF.
  def is_pdf?
    file_start = File.open(@path, 'rb') { |f| f.read(8) }
    !(file_start.to_s =~ /\%PDF-\d+\.?\d+/).nil?
  end

  # Load text that is already extracted
  def load_extracted_text(file)
    @text = File.read(file)
  end

  # Send file to give me text
  #
  # NOTE: uploads the file to the URL below over plain HTTP.
  def give_me_text
    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
    c.multipart_form_post = true
    c.http_post(Curl::PostField.file('file', @path))

    # Validate before storing so a failed extraction never becomes the text.
    body = c.body_str
    gotten_text_ok?(body)
    @text = body
  end

  # Checks if text was successfully extracted
  #
  # Throws :extraction_error when the response contains the service's
  # "Stream Closed" failure message; returns true otherwise.
  def gotten_text_ok?(text)
    throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
    true
  end

  # OCR with tesseract
  #
  # Works in "pages" and "text" directories relative to the current working
  # directory and removes both afterwards, including when a step fails.
  def ocr_pdf
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => 'pages')
    filename = File.basename(@path, File.extname(@path))
    docs = Dir['pages/' + filename + '*']

    # Extract text and save
    Docsplit.extract_text(docs, :output => 'text')
    text_files = Dir['text/' + filename + '*']

    # Order by the numeric page suffix; a plain string sort would put page
    # 10 before page 2.
    sorted_text = text_files.sort_by { |f| f[/_(\d+)\.txt\z/, 1].to_i }
    @text += sorted_text.map { |f| File.read(f) }.join
  ensure
    # Clean up
    FileUtils.rm_rf(%w[pages text])
  end

  private

  # Pick the extraction strategy: cached text first, then PDF OCR, then the
  # remote service for everything else.
  def extract_text
    cached = @output_dir + @rel_path
    if File.exist?(cached)
      load_extracted_text(cached)
    elsif pdf_extension?
      ocr_pdf
    else
      give_me_text
    end
  end

  # True when the file's extension is .pdf, in any letter case.
  def pdf_extension?
    File.extname(@path).downcase == ".pdf"
  end
end
|
data/lib/parsefile.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'docsplit'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'pry'
|
5
|
+
require 'dircrawl'
|
6
|
+
load 'ocrfile.rb'
|
7
|
+
load 'extractmetadata.rb'
|
8
|
+
|
9
|
+
# Combines a file's metadata and extracted text into one JSON document.
class ParseFile
  # file       - path of the file to parse
  # input_dir  - passed through to ExtractMetadata and OCRFile
  # output_dir - passed through to ExtractMetadata and OCRFile
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Parse the file
  #
  # Returns the JSON string built by gen_output.
  def parse_file
    # Get metadata
    @metadata = ExtractMetadata.new(@path, @input_dir, @output_dir).extract

    # OCR File
    @text = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path]).ocr

    # Generate output and return
    gen_output
  end

  # Generate output
  #
  # Returns pretty-printed JSON holding :full_path, every metadata key, and
  # :text, in that order. Metadata is treated as empty if none was collected.
  def gen_output
    outhash = { full_path: @path }
    outhash.merge!(@metadata || {})
    outhash[:text] = utf8_text(@text)
    JSON.pretty_generate(outhash)
  end

  private

  # JSON generation raises on strings that are not valid UTF-8, which OCR
  # output and HTTP response bodies can be. Re-tag binary strings as UTF-8,
  # transcode other encodings, and replace any remaining invalid bytes.
  # Non-string values are passed through untouched.
  def utf8_text(text)
    return text unless text.is_a?(String)

    str = text.dup
    str.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::BINARY
    unless str.encoding == Encoding::UTF_8
      str = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    end
    str.scrub
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: parsefile
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-01-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: OCR file and extract metadata
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/extractmetadata.rb
|
20
|
+
- lib/ocrfile.rb
|
21
|
+
- lib/parsefile.rb
|
22
|
+
homepage: https://github.com/TransparencyToolkit/parsefile
|
23
|
+
licenses:
|
24
|
+
- GPL
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.8
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: OCR file and extract metadata
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|