RubyGems - parsefile - Versions diffs - 0.0.9 → 0.0.10 - Mend

parsefile 0.0.9 → 0.0.10

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 99b0e344729338584b0af0696cb2e6e6ef18edef
-  data.tar.gz: 79b40b78a7af582e57492a4daa75fab5db0462b7
+  metadata.gz: bf06a478153c7107295b160bc8b2d76e403e7d8f
+  data.tar.gz: 4122ff2b451ba656d75893119209a1adefd2d5dc
 SHA512:
-  metadata.gz: 2f2683b2aa5ba9b328f3d8b1bbd9b75bcb5cf9f20aa4c434ee64db7b9ac798e842b998e315532b1253f7b955a2d952abcb9f67fcd9311bc4bf76a473a582a765
-  data.tar.gz: 2dc0d20a1c0eceb636ff89c15c2311a235a23814cae04bf11e41e2b305d0ca7508ce68180d7a441617cb273f0d9158881b8e28e235422ad48ef3886606a72623
+  metadata.gz: 0d92b7753ae69345a3a456be7f61982969bde31dfad4dde386fd4906a8ba631610a6a2dadf6ae7b3c841c01f12e6b7627d96528745aaeed11abf9166a7b321b8
+  data.tar.gz: 57620b37897bb7b99d523c9deb76948a723f305338a958a646c6e69b35a38ecaa4f3cb848803bbe8652c45f1b83579cdb5c981d9a132fbd88454eda60eaea066

data/lib/extractmetadata.rb CHANGED

@@ -6,6 +6,10 @@ class ExtractMetadata
     @path = file
     @input_dir = input_dir
     @output_dir = output_dir
+	@allowed_extensions = [
+      'pdf', 'doc', 'docbook', 'docx', 'txt', 'rtf', 'md', 'csv', 'xls', 'xlsx',
+      'jpg', 'jpeg', 'png', 'gif', 'svg'
+	]
   end
   # Extract metadata
@@ -23,7 +27,11 @@ class ExtractMetadata
     # Extract file metadata, merge. and return
     begin
-      outhash.merge!(extract_file_metadata)
+	  if (@allowed_extensions.include? outhash[:filetype])
+        outhash.merge!(extract_file_metadata)
+      else
+        puts "skipping ." + outhash[:filetype] + " file"
+      end
     rescue
     end
     return outhash

data/lib/ocrfile.rb CHANGED

@@ -1,5 +1,4 @@
 require 'fileutils'
-require 'docsplit'
 require 'curb'
 require 'mimemagic'
@@ -16,42 +15,32 @@ class OCRFile
   # OCR file
   def ocr
     begin
+	  mime_magic = MimeMagic.by_path(@path)
       if File.exist?(@output_dir+@rel_path+".json")
         load_extracted_text(@output_dir+@rel_path+".json")
-      #elsif @path.include?(".pdf")
-      #  ocr_pdf
       else
         if @tika
-          give_me_text_local
+          give_me_text_local(mime_magic)
         else
-          @text = File.read(@path)
-        #  give_me_text
+          give_me_text
         end
       end
-    rescue # Detect errors
-      #binding.pry
+    rescue
+	  # Detect errors
+      binding.pry
     end
     return @text
   end
-  # Check if file is pdf
-  def is_pdf?
-    puts "determined: is_pdf"
-    file_start = File.open(@path, 'r') { |f| f.read(8)}
-    file_start.match(/\%PDF-\d+\.?\d+/)
-  end
   # Load text that is already extracted
   def load_extracted_text(file)
-	puts "file exists: load_extracted_text"
+	puts "file already exists"
     @text = JSON.parse(File.read(file))["text"]
   end
   # Send file to give me text
   def give_me_text
-    puts "using: give_me_text"
     c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
     c.multipart_form_post = true
     c.http_post(Curl::PostField.file('file', @path))
@@ -60,17 +49,13 @@ class OCRFile
     gotten_text_ok?(@text)
   end
-  def give_me_text_local
-	puts "using: give_me_text_local"
+  def give_me_text_local(mime_magic)
 	c = Curl::Easy.new(@tika + "/tika")
-	# TODO: move this mime filtering to a higher global level
-	mime_magic = MimeMagic.by_path(@path)
 	file_data = File.read(@path)
 	c.headers['Content-Type'] = mime_magic.type
 	c.headers['Accept'] = "text/plain"
 	c.http_put(file_data)
-	#binding.pry
 	@text = c.body_str
 	gotten_text_ok?(@text)
   end
@@ -80,38 +65,4 @@ class OCRFile
     throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
   end
-  # OCR with tesseract
-  def ocr_pdf
-	puts "using: ocr_pdf"
-    # Dir_paths
-    base = Dir.pwd+"/"
-    # Split pages to handle large PDFs
-    Docsplit.extract_pages(@path, :output => base+'pages')
-    filename = @path.split("/").last.gsub(".pdf", "")
-    docs = Dir[base+'pages/'+filename+'*']
-    # Rename pages so that they can be processed with spaces
-    docs.each do |d|
-      new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
-      File.rename(d, base+'pages/'+new_name)
-    end
-    filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
-    docs_no_spaces = Dir[base+'pages/'+filename+'*']
-    # Extract text and save
-    Docsplit.extract_text(docs_no_spaces, :output => base+'text')
-    text_files = Dir[base+'text/'+filename+'*']
-    sorted_text = text_files.sort_by {|f|
-		f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
-    sorted_text.each do |f|
-      @text += File.read(f)
-    end
-    # Clean up
-    FileUtils.rm_f Dir.glob(base+"pages/*")
-    Dir.delete(base+"pages")
-    FileUtils.rm_f Dir.glob(base+"text/*")
-    Dir.delete(base+"text")
-  end
 end

data/lib/parsefile.rb CHANGED

@@ -1,44 +1,40 @@
 require 'json'
-require 'docsplit'
 require 'fileutils'
 require 'pry'
-load 'ocrfile.rb'
-load 'extractmetadata.rb'
+require 'ocrfile'
+require 'extractmetadata'
 class ParseFile
   def initialize(file, input_dir, output_dir, tika)
     @path = file
     @input_dir = input_dir
     @output_dir = output_dir
-	# Pass the url for a custom (or local) Tika server
-	# Else use OKFNs service over normal HTTP... ZOMG... O.o
+	# Pass URL of a Tika server
 	if tika
 	  @tika = tika
+	# Use OKFNs service over normal HTTP... ZOMG... O.o
 	else
 	  @tika = nil
 	end
   end
-  # Parse the file
   def parse_file
     begin
-    # Get metadata
-    m = ExtractMetadata.new(@path, @input_dir, @output_dir)
-    @metadata = m.extract
+	  puts "sending file: " + @path
+      m = ExtractMetadata.new(@path, @input_dir, @output_dir)
+      @metadata = m.extract
-    # OCR File
-    o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
-    @text = o.ocr
+      o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
+      @text = o.ocr
-    # Generate output and return
-    gen_output
-    rescue #TODO: Fix!
+      gen_output
+    rescue
+	  #TODO: use a global debug / log
       binding.pry
     end
   end
-  # Generate output
   def gen_output
     outhash = Hash.new
     outhash[:full_path] = @path

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parsefile
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-03-07 00:00:00.000000000 Z
+date: 2017-05-29 00:00:00.000000000 Z
 dependencies: []
 description: OCR file and extract metadata using Apache Tika and Tesseract
 email: shidash@shidash.com
@@ -40,9 +40,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: OCR file and extract metadata
 test_files: []
-has_rdoc: