parsefile 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/ocrfile.rb +6 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2738907314aac77a653fd655f4a200e2a63b6ed6
|
4
|
+
data.tar.gz: 8c780c9b2304699e37f71cdb7f4842918bc61fc9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e1385fdeb0923ebe72068c529341314aabebe1123f161e636762c19ad3325518716c759f9eae5fdd4bc6f12892709a45fe2dc5381e6151e6472677800fb31865
|
7
|
+
data.tar.gz: 919fbc77f37f6d40b15726ae9d5f36b622fdbe37d2789f730da61b5dea61d24ec12f359b607b096dbc5f0afd936e30fb68f23ee50efdd6276e3aa3c9273d6ade
|
data/lib/ocrfile.rb
CHANGED
@@ -28,7 +28,7 @@ class OCRFile
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
rescue # Detect errors
|
31
|
-
binding.pry
|
31
|
+
#binding.pry
|
32
32
|
end
|
33
33
|
|
34
34
|
return @text
|
@@ -36,17 +36,20 @@ class OCRFile
|
|
36
36
|
|
37
37
|
# Check if file is pdf
|
38
38
|
def is_pdf?
|
39
|
+
puts "determined: is_pdf"
|
39
40
|
file_start = File.open(@path, 'r') { |f| f.read(8)}
|
40
41
|
file_start.match(/\%PDF-\d+\.?\d+/)
|
41
42
|
end
|
42
43
|
|
43
44
|
# Load text that is already extracted
|
44
45
|
def load_extracted_text(file)
|
46
|
+
puts "file exists: load_extracted_text"
|
45
47
|
@text = JSON.parse(File.read(file))["text"]
|
46
48
|
end
|
47
49
|
|
48
50
|
# Send file to give me text
|
49
51
|
def give_me_text
|
52
|
+
puts "using: give_me_text"
|
50
53
|
c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
|
51
54
|
c.multipart_form_post = true
|
52
55
|
c.http_post(Curl::PostField.file('file', @path))
|
@@ -56,6 +59,7 @@ class OCRFile
|
|
56
59
|
end
|
57
60
|
|
58
61
|
def give_me_text_local
|
62
|
+
puts "using: give_me_text_local"
|
59
63
|
c = Curl::Easy.new(@tika + "/tika")
|
60
64
|
# TODO: move this mime filtering to a higher global level
|
61
65
|
mime_magic = MimeMagic.by_path(@path)
|
@@ -65,7 +69,6 @@ class OCRFile
|
|
65
69
|
c.http_put(file_data)
|
66
70
|
|
67
71
|
#binding.pry
|
68
|
-
|
69
72
|
@text = c.body_str
|
70
73
|
gotten_text_ok?(@text)
|
71
74
|
end
|
@@ -77,6 +80,7 @@ class OCRFile
|
|
77
80
|
|
78
81
|
# OCR with tesseract
|
79
82
|
def ocr_pdf
|
83
|
+
puts "using: ocr_pdf"
|
80
84
|
# Dir_paths
|
81
85
|
base = Dir.pwd+"/"
|
82
86
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parsefile
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-01-19 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: OCR file and extract metadata using Apache Tika and Tesseract
|
15
15
|
email: shidash@shidash.com
|