parsefile 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/extractmetadata.rb +9 -1
- data/lib/ocrfile.rb +8 -57
- data/lib/parsefile.rb +13 -17
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf06a478153c7107295b160bc8b2d76e403e7d8f
|
4
|
+
data.tar.gz: 4122ff2b451ba656d75893119209a1adefd2d5dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0d92b7753ae69345a3a456be7f61982969bde31dfad4dde386fd4906a8ba631610a6a2dadf6ae7b3c841c01f12e6b7627d96528745aaeed11abf9166a7b321b8
|
7
|
+
data.tar.gz: 57620b37897bb7b99d523c9deb76948a723f305338a958a646c6e69b35a38ecaa4f3cb848803bbe8652c45f1b83579cdb5c981d9a132fbd88454eda60eaea066
|
data/lib/extractmetadata.rb
CHANGED
@@ -6,6 +6,10 @@ class ExtractMetadata
|
|
6
6
|
@path = file
|
7
7
|
@input_dir = input_dir
|
8
8
|
@output_dir = output_dir
|
9
|
+
@allowed_extensions = [
|
10
|
+
'pdf', 'doc', 'docbook', 'docx', 'txt', 'rtf', 'md', 'csv', 'xls', 'xlsx',
|
11
|
+
'jpg', 'jpeg', 'png', 'gif', 'svg'
|
12
|
+
]
|
9
13
|
end
|
10
14
|
|
11
15
|
# Extract metadata
|
@@ -23,7 +27,11 @@ class ExtractMetadata
|
|
23
27
|
|
24
28
|
# Extract file metadata, merge, and return
|
25
29
|
begin
|
26
|
-
|
30
|
+
if (@allowed_extensions.include? outhash[:filetype])
|
31
|
+
outhash.merge!(extract_file_metadata)
|
32
|
+
else
|
33
|
+
puts "skipping ." + outhash[:filetype] + " file"
|
34
|
+
end
|
27
35
|
rescue
|
28
36
|
end
|
29
37
|
return outhash
|
data/lib/ocrfile.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'fileutils'
|
2
|
-
require 'docsplit'
|
3
2
|
require 'curb'
|
4
3
|
require 'mimemagic'
|
5
4
|
|
@@ -16,42 +15,32 @@ class OCRFile
|
|
16
15
|
# OCR file
|
17
16
|
def ocr
|
18
17
|
begin
|
18
|
+
mime_magic = MimeMagic.by_path(@path)
|
19
19
|
if File.exist?(@output_dir+@rel_path+".json")
|
20
20
|
load_extracted_text(@output_dir+@rel_path+".json")
|
21
|
-
#elsif @path.include?(".pdf")
|
22
|
-
# ocr_pdf
|
23
21
|
else
|
24
22
|
if @tika
|
25
|
-
give_me_text_local
|
23
|
+
give_me_text_local(mime_magic)
|
26
24
|
else
|
27
|
-
|
28
|
-
# give_me_text
|
25
|
+
give_me_text
|
29
26
|
end
|
30
27
|
end
|
31
|
-
rescue
|
32
|
-
|
28
|
+
rescue
|
29
|
+
# Detect errors
|
30
|
+
binding.pry
|
33
31
|
end
|
34
32
|
|
35
33
|
return @text
|
36
34
|
end
|
37
35
|
|
38
|
-
# Check if file is pdf
|
39
|
-
def is_pdf?
|
40
|
-
puts "determined: is_pdf"
|
41
|
-
file_start = File.open(@path, 'r') { |f| f.read(8)}
|
42
|
-
file_start.match(/\%PDF-\d+\.?\d+/)
|
43
|
-
end
|
44
|
-
|
45
36
|
# Load text that is already extracted
|
46
37
|
def load_extracted_text(file)
|
47
|
-
puts "file exists
|
38
|
+
puts "file already exists"
|
48
39
|
@text = JSON.parse(File.read(file))["text"]
|
49
40
|
end
|
50
41
|
|
51
42
|
# Send file to give me text
|
52
43
|
def give_me_text
|
53
|
-
puts "using: give_me_text"
|
54
|
-
|
55
44
|
c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
|
56
45
|
c.multipart_form_post = true
|
57
46
|
c.http_post(Curl::PostField.file('file', @path))
|
@@ -60,17 +49,13 @@ class OCRFile
|
|
60
49
|
gotten_text_ok?(@text)
|
61
50
|
end
|
62
51
|
|
63
|
-
def give_me_text_local
|
64
|
-
puts "using: give_me_text_local"
|
52
|
+
def give_me_text_local(mime_magic)
|
65
53
|
c = Curl::Easy.new(@tika + "/tika")
|
66
|
-
# TODO: move this mime filtering to a higher global level
|
67
|
-
mime_magic = MimeMagic.by_path(@path)
|
68
54
|
file_data = File.read(@path)
|
69
55
|
c.headers['Content-Type'] = mime_magic.type
|
70
56
|
c.headers['Accept'] = "text/plain"
|
71
57
|
c.http_put(file_data)
|
72
58
|
|
73
|
-
#binding.pry
|
74
59
|
@text = c.body_str
|
75
60
|
gotten_text_ok?(@text)
|
76
61
|
end
|
@@ -80,38 +65,4 @@ class OCRFile
|
|
80
65
|
throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
|
81
66
|
end
|
82
67
|
|
83
|
-
# OCR with tesseract
|
84
|
-
def ocr_pdf
|
85
|
-
puts "using: ocr_pdf"
|
86
|
-
# Dir_paths
|
87
|
-
base = Dir.pwd+"/"
|
88
|
-
|
89
|
-
# Split pages to handle large PDFs
|
90
|
-
Docsplit.extract_pages(@path, :output => base+'pages')
|
91
|
-
filename = @path.split("/").last.gsub(".pdf", "")
|
92
|
-
docs = Dir[base+'pages/'+filename+'*']
|
93
|
-
|
94
|
-
# Rename pages so that they can be processed with spaces
|
95
|
-
docs.each do |d|
|
96
|
-
new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
|
97
|
-
File.rename(d, base+'pages/'+new_name)
|
98
|
-
end
|
99
|
-
filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
|
100
|
-
docs_no_spaces = Dir[base+'pages/'+filename+'*']
|
101
|
-
|
102
|
-
# Extract text and save
|
103
|
-
Docsplit.extract_text(docs_no_spaces, :output => base+'text')
|
104
|
-
text_files = Dir[base+'text/'+filename+'*']
|
105
|
-
sorted_text = text_files.sort_by {|f|
|
106
|
-
f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
|
107
|
-
sorted_text.each do |f|
|
108
|
-
@text += File.read(f)
|
109
|
-
end
|
110
|
-
|
111
|
-
# Clean up
|
112
|
-
FileUtils.rm_f Dir.glob(base+"pages/*")
|
113
|
-
Dir.delete(base+"pages")
|
114
|
-
FileUtils.rm_f Dir.glob(base+"text/*")
|
115
|
-
Dir.delete(base+"text")
|
116
|
-
end
|
117
68
|
end
|
data/lib/parsefile.rb
CHANGED
@@ -1,44 +1,40 @@
|
|
1
1
|
require 'json'
|
2
|
-
require 'docsplit'
|
3
2
|
require 'fileutils'
|
4
3
|
require 'pry'
|
5
|
-
|
6
|
-
|
7
|
-
load 'extractmetadata.rb'
|
4
|
+
require 'ocrfile'
|
5
|
+
require 'extractmetadata'
|
8
6
|
|
9
7
|
class ParseFile
|
10
8
|
def initialize(file, input_dir, output_dir, tika)
|
11
9
|
@path = file
|
12
10
|
@input_dir = input_dir
|
13
11
|
@output_dir = output_dir
|
14
|
-
# Pass
|
15
|
-
# Else use OKFNs service over normal HTTP... ZOMG... O.o
|
12
|
+
# Pass URL of a Tika server
|
16
13
|
if tika
|
17
14
|
@tika = tika
|
15
|
+
# Use OKFNs service over normal HTTP... ZOMG... O.o
|
18
16
|
else
|
19
17
|
@tika = nil
|
20
18
|
end
|
21
19
|
end
|
22
20
|
|
23
|
-
# Parse the file
|
24
21
|
def parse_file
|
25
22
|
begin
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
puts "sending file: " + @path
|
24
|
+
|
25
|
+
m = ExtractMetadata.new(@path, @input_dir, @output_dir)
|
26
|
+
@metadata = m.extract
|
29
27
|
|
30
|
-
|
31
|
-
|
32
|
-
@text = o.ocr
|
28
|
+
o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
|
29
|
+
@text = o.ocr
|
33
30
|
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
gen_output
|
32
|
+
rescue
|
33
|
+
#TODO: use a global debug / log
|
37
34
|
binding.pry
|
38
35
|
end
|
39
36
|
end
|
40
37
|
|
41
|
-
# Generate output
|
42
38
|
def gen_output
|
43
39
|
outhash = Hash.new
|
44
40
|
outhash[:full_path] = @path
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parsefile
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-29 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: OCR file and extract metadata using Apache Tika and Tesseract
|
15
15
|
email: shidash@shidash.com
|
@@ -40,9 +40,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
40
|
version: '0'
|
41
41
|
requirements: []
|
42
42
|
rubyforge_project:
|
43
|
-
rubygems_version: 2.
|
43
|
+
rubygems_version: 2.6.11
|
44
44
|
signing_key:
|
45
45
|
specification_version: 4
|
46
46
|
summary: OCR file and extract metadata
|
47
47
|
test_files: []
|
48
|
-
has_rdoc:
|