parsefile 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/ocrfile.rb +27 -4
  3. data/lib/parsefile.rb +9 -2
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2cc7beaa60ab4c121fa0d8ff48be153268399eb
4
- data.tar.gz: 0ec7a2795d6177114c210523650d0d3dbe9b68b1
3
+ metadata.gz: c9b7c462a9aee22375232c5d0a2533cfae0ea46b
4
+ data.tar.gz: d9aba7f3809b24b4d21cceee8f9d55e957e4128c
5
5
  SHA512:
6
- metadata.gz: 2026ab5bce89bfa4b0736682dcbbbd5cddce2624bab53ded19a22586e010a20b0d4c18abc38a30fbca665058223c3151b32737c8606525cd7e9c682a2e616c0d
7
- data.tar.gz: 60958a689b1510a039f6f668b37d3a115691d2776374afca6f0932a664e309b7bbad188954b018645394c95cd092ba506c0c21eea35256d7c695f5446e442b27
6
+ metadata.gz: d172ebfaa962e386490b1012a929f3c20d19f9926dbecfe59defeff19202932d2744b5bb1f1e4bca7acf7956563bdcb14a991092293c3aac5a5429041b130e0b
7
+ data.tar.gz: da667040f692c10b9da0098a52abcb6e7b972a004d0e84313cc84544b930022687a0bbab18de8cdbecc07d7aa535d5ea7e4494eecfb5db65a9599b336cbc8d0a
data/lib/ocrfile.rb CHANGED
@@ -1,13 +1,15 @@
1
1
  require 'fileutils'
2
2
  require 'docsplit'
3
3
  require 'curb'
4
+ require 'mimemagic'
4
5
 
5
6
  class OCRFile
6
- def initialize(file, input_dir, output_dir, rel_path)
7
+ def initialize(file, input_dir, output_dir, rel_path, tika)
7
8
  @path = file
8
9
  @input_dir = input_dir
9
10
  @output_dir = output_dir
10
11
  @rel_path = rel_path
12
+ @tika = tika
11
13
  @text = ""
12
14
  end
13
15
 
@@ -19,7 +21,11 @@ class OCRFile
19
21
  elsif @path.include?(".pdf")
20
22
  ocr_pdf
21
23
  else
22
- give_me_text
24
+ if @tika
25
+ give_me_text_local
26
+ else
27
+ give_me_text
28
+ end
23
29
  end
24
30
  rescue # Detect errors
25
31
  binding.pry
@@ -44,10 +50,26 @@ class OCRFile
44
50
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
45
51
  c.multipart_form_post = true
46
52
  c.http_post(Curl::PostField.file('file', @path))
47
- @text = c.body_str
53
+
54
+ @text = c.body_str
48
55
  gotten_text_ok?(@text)
49
56
  end
50
57
 
58
+ def give_me_text_local
59
+ c = Curl::Easy.new(@tika + "/tika")
60
+ # TODO: move this mime filtering to a higher global level
61
+ mime_magic = MimeMagic.by_path(@path)
62
+ file_data = File.read(@path)
63
+ c.headers['Content-Type'] = mime_magic.type
64
+ c.headers['Accept'] = "text/plain"
65
+ c.http_put(file_data)
66
+
67
+ #binding.pry
68
+
69
+ @text = c.body_str
70
+ gotten_text_ok?(@text)
71
+ end
72
+
51
73
  # Checks if text was successfully extracted
52
74
  def gotten_text_ok?(text)
53
75
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
@@ -74,7 +96,8 @@ class OCRFile
74
96
  # Extract text and save
75
97
  Docsplit.extract_text(docs_no_spaces, :output => base+'text')
76
98
  text_files = Dir[base+'text/'+filename+'*']
77
- sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
99
+ sorted_text = text_files.sort_by {|f|
100
+ f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
78
101
  sorted_text.each do |f|
79
102
  @text += File.read(f)
80
103
  end
data/lib/parsefile.rb CHANGED
@@ -7,10 +7,17 @@ load 'ocrfile.rb'
7
7
  load 'extractmetadata.rb'
8
8
 
9
9
  class ParseFile
10
- def initialize(file, input_dir, output_dir)
10
+ def initialize(file, input_dir, output_dir, tika)
11
11
  @path = file
12
12
  @input_dir = input_dir
13
13
  @output_dir = output_dir
14
+ # Pass the url for a custom (or local) Tika server
15
+ # Else use OKFNs service over normal HTTP... ZOMG... O.o
16
+ if tika
17
+ @tika = tika
18
+ else
19
+ @tika = nil
20
+ end
14
21
  end
15
22
 
16
23
  # Parse the file
@@ -21,7 +28,7 @@ class ParseFile
21
28
  @metadata = m.extract
22
29
 
23
30
  # OCR File
24
- o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path])
31
+ o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
25
32
  @text = o.ocr
26
33
 
27
34
  # Generate output and return
metadata CHANGED
@@ -1,16 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
+ - Brennan Novak
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
12
  date: 2016-05-16 00:00:00.000000000 Z
12
13
  dependencies: []
13
- description: OCR file and extract metadata
14
+ description: OCR file and extract metadata using Apache Tika and Tesseract
14
15
  email: shidash@shidash.com
15
16
  executables: []
16
17
  extensions: []