parsefile 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/ocrfile.rb +27 -4
  3. data/lib/parsefile.rb +9 -2
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2cc7beaa60ab4c121fa0d8ff48be153268399eb
4
- data.tar.gz: 0ec7a2795d6177114c210523650d0d3dbe9b68b1
3
+ metadata.gz: c9b7c462a9aee22375232c5d0a2533cfae0ea46b
4
+ data.tar.gz: d9aba7f3809b24b4d21cceee8f9d55e957e4128c
5
5
  SHA512:
6
- metadata.gz: 2026ab5bce89bfa4b0736682dcbbbd5cddce2624bab53ded19a22586e010a20b0d4c18abc38a30fbca665058223c3151b32737c8606525cd7e9c682a2e616c0d
7
- data.tar.gz: 60958a689b1510a039f6f668b37d3a115691d2776374afca6f0932a664e309b7bbad188954b018645394c95cd092ba506c0c21eea35256d7c695f5446e442b27
6
+ metadata.gz: d172ebfaa962e386490b1012a929f3c20d19f9926dbecfe59defeff19202932d2744b5bb1f1e4bca7acf7956563bdcb14a991092293c3aac5a5429041b130e0b
7
+ data.tar.gz: da667040f692c10b9da0098a52abcb6e7b972a004d0e84313cc84544b930022687a0bbab18de8cdbecc07d7aa535d5ea7e4494eecfb5db65a9599b336cbc8d0a
data/lib/ocrfile.rb CHANGED
@@ -1,13 +1,15 @@
1
1
  require 'fileutils'
2
2
  require 'docsplit'
3
3
  require 'curb'
4
+ require 'mimemagic'
4
5
 
5
6
  class OCRFile
6
- def initialize(file, input_dir, output_dir, rel_path)
7
+ def initialize(file, input_dir, output_dir, rel_path, tika)
7
8
  @path = file
8
9
  @input_dir = input_dir
9
10
  @output_dir = output_dir
10
11
  @rel_path = rel_path
12
+ @tika = tika
11
13
  @text = ""
12
14
  end
13
15
 
@@ -19,7 +21,11 @@ class OCRFile
19
21
  elsif @path.include?(".pdf")
20
22
  ocr_pdf
21
23
  else
22
- give_me_text
24
+ if @tika
25
+ give_me_text_local
26
+ else
27
+ give_me_text
28
+ end
23
29
  end
24
30
  rescue # Detect errors
25
31
  binding.pry
@@ -44,10 +50,26 @@ class OCRFile
44
50
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
45
51
  c.multipart_form_post = true
46
52
  c.http_post(Curl::PostField.file('file', @path))
47
- @text = c.body_str
53
+
54
+ @text = c.body_str
48
55
  gotten_text_ok?(@text)
49
56
  end
50
57
 
58
+ def give_me_text_local
59
+ c = Curl::Easy.new(@tika + "/tika")
60
+ # TODO: move this mime filtering to a higher global level
61
+ mime_magic = MimeMagic.by_path(@path)
62
+ file_data = File.read(@path)
63
+ c.headers['Content-Type'] = mime_magic.type
64
+ c.headers['Accept'] = "text/plain"
65
+ c.http_put(file_data)
66
+
67
+ #binding.pry
68
+
69
+ @text = c.body_str
70
+ gotten_text_ok?(@text)
71
+ end
72
+
51
73
  # Checks if text was successfully extracted
52
74
  def gotten_text_ok?(text)
53
75
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
@@ -74,7 +96,8 @@ class OCRFile
74
96
  # Extract text and save
75
97
  Docsplit.extract_text(docs_no_spaces, :output => base+'text')
76
98
  text_files = Dir[base+'text/'+filename+'*']
77
- sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
99
+ sorted_text = text_files.sort_by {|f|
100
+ f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
78
101
  sorted_text.each do |f|
79
102
  @text += File.read(f)
80
103
  end
data/lib/parsefile.rb CHANGED
@@ -7,10 +7,17 @@ load 'ocrfile.rb'
7
7
  load 'extractmetadata.rb'
8
8
 
9
9
  class ParseFile
10
- def initialize(file, input_dir, output_dir)
10
+ def initialize(file, input_dir, output_dir, tika)
11
11
  @path = file
12
12
  @input_dir = input_dir
13
13
  @output_dir = output_dir
14
+ # Pass the url for a custom (or local) Tika server
15
+ # Else use OKFNs service over normal HTTP... ZOMG... O.o
16
+ if tika
17
+ @tika = tika
18
+ else
19
+ @tika = nil
20
+ end
14
21
  end
15
22
 
16
23
  # Parse the file
@@ -21,7 +28,7 @@ class ParseFile
21
28
  @metadata = m.extract
22
29
 
23
30
  # OCR File
24
- o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path])
31
+ o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
25
32
  @text = o.ocr
26
33
 
27
34
  # Generate output and return
metadata CHANGED
@@ -1,16 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
+ - Brennan Novak
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
12
  date: 2016-05-16 00:00:00.000000000 Z
12
13
  dependencies: []
13
- description: OCR file and extract metadata
14
+ description: OCR file and extract metadata using Apache Tika and Tesseract
14
15
  email: shidash@shidash.com
15
16
  executables: []
16
17
  extensions: []