RubyGems - parsefile - Versions diffs - 0.0.4 → 0.0.5 - Mend

parsefile 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a2cc7beaa60ab4c121fa0d8ff48be153268399eb
-  data.tar.gz: 0ec7a2795d6177114c210523650d0d3dbe9b68b1
+  metadata.gz: c9b7c462a9aee22375232c5d0a2533cfae0ea46b
+  data.tar.gz: d9aba7f3809b24b4d21cceee8f9d55e957e4128c
 SHA512:
-  metadata.gz: 2026ab5bce89bfa4b0736682dcbbbd5cddce2624bab53ded19a22586e010a20b0d4c18abc38a30fbca665058223c3151b32737c8606525cd7e9c682a2e616c0d
-  data.tar.gz: 60958a689b1510a039f6f668b37d3a115691d2776374afca6f0932a664e309b7bbad188954b018645394c95cd092ba506c0c21eea35256d7c695f5446e442b27
+  metadata.gz: d172ebfaa962e386490b1012a929f3c20d19f9926dbecfe59defeff19202932d2744b5bb1f1e4bca7acf7956563bdcb14a991092293c3aac5a5429041b130e0b
+  data.tar.gz: da667040f692c10b9da0098a52abcb6e7b972a004d0e84313cc84544b930022687a0bbab18de8cdbecc07d7aa535d5ea7e4494eecfb5db65a9599b336cbc8d0a

data/lib/ocrfile.rb CHANGED

@@ -1,13 +1,15 @@
 require 'fileutils'
 require 'docsplit'
 require 'curb'
+require 'mimemagic'
 class OCRFile
-  def initialize(file, input_dir, output_dir, rel_path)
+  def initialize(file, input_dir, output_dir, rel_path, tika)
     @path = file
     @input_dir = input_dir
     @output_dir = output_dir
     @rel_path = rel_path
+	@tika = tika
     @text = ""
   end
@@ -19,7 +21,11 @@ class OCRFile
       elsif @path.include?(".pdf")
         ocr_pdf
       else
-        give_me_text
+        if @tika
+          give_me_text_local
+        else
+          give_me_text
+        end
       end
     rescue # Detect errors
       binding.pry
@@ -44,10 +50,26 @@ class OCRFile
     c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
     c.multipart_form_post = true
     c.http_post(Curl::PostField.file('file', @path))
-    @text = c.body_str
+	@text = c.body_str
     gotten_text_ok?(@text)
   end
+  def give_me_text_local
+	c = Curl::Easy.new(@tika + "/tika")
+	# TODO: move this mime filtering to a higher global level
+	mime_magic = MimeMagic.by_path(@path)
+	file_data = File.read(@path)
+	c.headers['Content-Type'] = mime_magic.type
+	c.headers['Accept'] = "text/plain"
+	c.http_put(file_data)
+	#binding.pry
+	@text = c.body_str
+	gotten_text_ok?(@text)
+  end
   # Checks if text was successfully extracted
   def gotten_text_ok?(text)
     throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
@@ -74,7 +96,8 @@ class OCRFile
     # Extract text and save
     Docsplit.extract_text(docs_no_spaces, :output => base+'text')
     text_files = Dir[base+'text/'+filename+'*']
-    sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
+    sorted_text = text_files.sort_by {|f|
+		f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
     sorted_text.each do |f|
       @text += File.read(f)
     end

data/lib/parsefile.rb CHANGED

@@ -7,10 +7,17 @@ load 'ocrfile.rb'
 load 'extractmetadata.rb'
 class ParseFile
-  def initialize(file, input_dir, output_dir)
+  def initialize(file, input_dir, output_dir, tika)
     @path = file
     @input_dir = input_dir
     @output_dir = output_dir
+	# Pass the url for a custom (or local) Tika server
+	# Else use OKFNs service over normal HTTP... ZOMG... O.o
+	if tika
+	  @tika = tika
+	else
+	  @tika = nil
+	end
   end
   # Parse the file
@@ -21,7 +28,7 @@ class ParseFile
     @metadata = m.extract
     # OCR File
-    o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path])
+    o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
     @text = o.ocr
     # Generate output and return

metadata CHANGED

@@ -1,16 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: parsefile
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - M. C. McGrath
+- Brennan Novak
 autorequire:
 bindir: bin
 cert_chain: []
 date: 2016-05-16 00:00:00.000000000 Z
 dependencies: []
-description: OCR file and extract metadata
+description: OCR file and extract metadata using Apache Tika and Tesseract
 email: shidash@shidash.com
 executables: []
 extensions: []