parsefile 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/extractmetadata.rb +4 -1
- data/lib/ocrfile.rb +23 -12
- data/lib/parsefile.rb +18 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e7561f81277ae170560990c0efd6f1fac6df28d
|
4
|
+
data.tar.gz: 0efdf3f4c2c98114be285ac6c7c60d9d728aa0e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 094423025dd474a3aaf7ce4a7c7ee3f1ba7b39946eaee75f25b4e65fe9ea3117fc8fab7a1fd9afd3680a4a722b25eee9bfe6ed92be61be815a1e3f78f39eb6d2
|
7
|
+
data.tar.gz: 5f41812876112bfaa81df0fcf2d90842591abe821429af585d8d90702fe0b8822c4885ae6852c979226884ecc5a7fc1832ab072a36f58c8deaf03226ae6205d7
|
data/lib/extractmetadata.rb
CHANGED
data/lib/ocrfile.rb
CHANGED
@@ -14,8 +14,8 @@ class OCRFile
|
|
14
14
|
# OCR file
|
15
15
|
def ocr
|
16
16
|
begin
|
17
|
-
if File.exist?(@output_dir+@rel_path)
|
18
|
-
load_extracted_text(@output_dir+@rel_path)
|
17
|
+
if File.exist?(@output_dir+@rel_path+".json")
|
18
|
+
load_extracted_text(@output_dir+@rel_path+".json")
|
19
19
|
elsif @path.include?(".pdf")
|
20
20
|
ocr_pdf
|
21
21
|
else
|
@@ -24,7 +24,7 @@ class OCRFile
|
|
24
24
|
rescue # Detect errors
|
25
25
|
binding.pry
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
return @text
|
29
29
|
end
|
30
30
|
|
@@ -36,7 +36,7 @@ class OCRFile
|
|
36
36
|
|
37
37
|
# Load text that is already extracted
|
38
38
|
def load_extracted_text(file)
|
39
|
-
@text = File.read(file)
|
39
|
+
@text = JSON.parse(File.read(file))["text"]
|
40
40
|
end
|
41
41
|
|
42
42
|
# Send file to give me text
|
@@ -55,23 +55,34 @@ class OCRFile
|
|
55
55
|
|
56
56
|
# OCR with tesseract
|
57
57
|
def ocr_pdf
|
58
|
+
# Dir_paths
|
59
|
+
base = Dir.pwd+"/"
|
60
|
+
|
58
61
|
# Split pages to handle large PDFs
|
59
|
-
Docsplit.extract_pages(@path, :output => 'pages')
|
62
|
+
Docsplit.extract_pages(@path, :output => base+'pages')
|
60
63
|
filename = @path.split("/").last.gsub(".pdf", "")
|
61
|
-
docs = Dir['pages/'+filename+'*']
|
64
|
+
docs = Dir[base+'pages/'+filename+'*']
|
62
65
|
|
66
|
+
# Rename pages so that they can be processed with spaces
|
67
|
+
docs.each do |d|
|
68
|
+
new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
|
69
|
+
File.rename(d, base+'pages/'+new_name)
|
70
|
+
end
|
71
|
+
filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
|
72
|
+
docs_no_spaces = Dir[base+'pages/'+filename+'*']
|
73
|
+
|
63
74
|
# Extract text and save
|
64
|
-
Docsplit.extract_text(docs, :output => 'text')
|
65
|
-
text_files = Dir['text/'+filename+'*']
|
75
|
+
Docsplit.extract_text(docs_no_spaces, :output => base+'text')
|
76
|
+
text_files = Dir[base+'text/'+filename+'*']
|
66
77
|
sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
|
67
78
|
sorted_text.each do |f|
|
68
79
|
@text += File.read(f)
|
69
80
|
end
|
70
81
|
|
71
82
|
# Clean up
|
72
|
-
FileUtils.rm_f Dir.glob("pages/*")
|
73
|
-
Dir.delete("pages")
|
74
|
-
FileUtils.rm_f Dir.glob("text/*")
|
75
|
-
Dir.delete("text")
|
83
|
+
FileUtils.rm_f Dir.glob(base+"pages/*")
|
84
|
+
Dir.delete(base+"pages")
|
85
|
+
FileUtils.rm_f Dir.glob(base+"text/*")
|
86
|
+
Dir.delete(base+"text")
|
76
87
|
end
|
77
88
|
end
|
data/lib/parsefile.rb
CHANGED
@@ -2,9 +2,9 @@ require 'json'
|
|
2
2
|
require 'docsplit'
|
3
3
|
require 'fileutils'
|
4
4
|
require 'pry'
|
5
|
-
require 'dircrawl'
|
6
|
-
load 'ocrfile.rb'
|
7
|
-
load 'extractmetadata.rb'
|
5
|
+
#require 'dircrawl'
|
6
|
+
load '/home/shidash/Code/ParseFile/lib/ocrfile.rb' #FIX
|
7
|
+
load '/home/shidash/Code/ParseFile/lib/extractmetadata.rb' # FIX
|
8
8
|
|
9
9
|
class ParseFile
|
10
10
|
def initialize(file, input_dir, output_dir)
|
@@ -15,6 +15,7 @@ class ParseFile
|
|
15
15
|
|
16
16
|
# Parse the file
|
17
17
|
def parse_file
|
18
|
+
begin
|
18
19
|
# Get metadata
|
19
20
|
m = ExtractMetadata.new(@path, @input_dir, @output_dir)
|
20
21
|
@metadata = m.extract
|
@@ -25,6 +26,9 @@ class ParseFile
|
|
25
26
|
|
26
27
|
# Generate output and return
|
27
28
|
gen_output
|
29
|
+
rescue #TODO: Fix!
|
30
|
+
binding.pry
|
31
|
+
end
|
28
32
|
end
|
29
33
|
|
30
34
|
# Generate output
|
@@ -32,7 +36,16 @@ class ParseFile
|
|
32
36
|
outhash = Hash.new
|
33
37
|
outhash[:full_path] = @path
|
34
38
|
outhash.merge!(@metadata)
|
35
|
-
|
36
|
-
|
39
|
+
begin
|
40
|
+
outhash[:text] = @text.to_s.encode('UTF-8', {
|
41
|
+
:invalid => :replace,
|
42
|
+
:undef => :replace,
|
43
|
+
:replace => '?'
|
44
|
+
})
|
45
|
+
return JSON.pretty_generate(outhash)
|
46
|
+
rescue
|
47
|
+
binding.pry
|
48
|
+
end
|
37
49
|
end
|
38
50
|
end
|
51
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parsefile
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: OCR file and extract metadata
|
14
14
|
email: shidash@shidash.com
|