RubyGems - parsefile - Versions diffs - 0.0.1 - Mend

parsefile 0.0.1

Files changed (5) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
+  data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
+SHA512:
+  metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
+  data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0

data/lib/extractmetadata.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'json'
+require 'docsplit'
# Builds a hash of descriptive metadata for one document: its path relative
# to the input directory, a display name, its file type and the document
# properties reported by Docsplit.
class ExtractMetadata
  # file       - path of the document to inspect
  # input_dir  - directory prefix removed from the front of file to form the
  #              relative path
  # output_dir - stored, but not read by any method of this class
  def initialize(file, input_dir, output_dir)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
  end

  # Extract metadata
  #
  # Returns a Hash with :rel_path, :formatted_name and :filetype followed by
  # the keys produced by extract_file_metadata.
  def extract
    # Recomputed on every call so the result always reflects the current path
    @rel_path = get_rel_path
    outhash = {
      :rel_path => @rel_path,
      :formatted_name => get_formatted_name,
      :filetype => get_file_type
    }
    # Extract file metadata, merge, and return
    outhash.merge!(extract_file_metadata)
    outhash
  end

  # Get the relative path
  #
  # Only a leading occurrence of the input directory is removed; the same
  # text appearing later in the path (e.g. "/data/data/x.pdf") is preserved.
  def get_rel_path
    @path.sub(/\A#{Regexp.escape(@input_dir.to_s)}/, "")
  end

  # Get a formatted file name
  #
  # Drops the final extension only (dots in directory names or earlier in the
  # file name are kept), turns underscores into spaces and removes slashes.
  # NOTE(review): removing slashes fuses directory names into the file name
  # ("/dir/my_file.pdf" becomes "dirmy file"); kept as-is for compatibility.
  def get_formatted_name
    rel = rel_path
    ext = File.extname(rel)
    stem = ext.empty? ? rel : rel[0...-ext.length]
    stem.tr("_", " ").delete("/")
  end

  # Get file type
  #
  # Returns the final extension without its dot, or "" when the file has no
  # extension.
  def get_file_type
    File.extname(rel_path).sub(/\A\./, "")
  end

  # Extract PDF metadata
  #
  # Returns a Hash holding whatever Docsplit reports for each property.
  def extract_file_metadata
    {
      :author => Docsplit.extract_author(@path),
      :creator => Docsplit.extract_creator(@path),
      :producer => Docsplit.extract_producer(@path),
      :title => Docsplit.extract_title(@path),
      :subject => Docsplit.extract_subject(@path),
      :date => Docsplit.extract_date(@path),
      :keywords => Docsplit.extract_keywords(@path),
      :length => Docsplit.extract_length(@path)
    }
  end

  private

  # Relative path, computed on first use so the name/type helpers also work
  # when they are called before extract.
  def rel_path
    @rel_path ||= get_rel_path
  end
end

data/lib/ocrfile.rb ADDED Viewed

@@ -0,0 +1,77 @@
+require 'fileutils'
+require 'docsplit'
+require 'curb'
# Produces the plain text of one document. Text already present under the
# output directory is reused; PDFs are split and run through Docsplit; any
# other file is posted to the Give Me Text service.
class OCRFile
  # Raised when the remote extraction service answers with an error body.
  class ExtractionError < StandardError; end

  # Scratch directories (relative to the current working directory) used
  # while splitting a PDF; both are removed again after every ocr_pdf run.
  PAGES_DIR = 'pages'
  TEXT_DIR = 'text'

  # file       - path of the document to read
  # input_dir  - stored, but not read by any method of this class
  # output_dir - directory prefix under which previously extracted text lives
  # rel_path   - path of the document relative to the input directory; it is
  #              appended to output_dir to find previously extracted text
  def initialize(file, input_dir, output_dir, rel_path)
    @path = file
    @input_dir = input_dir
    @output_dir = output_dir
    @rel_path = rel_path
    @text = ""
  end

  # OCR file
  #
  # Returns the extracted text. Extraction is best effort: a failure is
  # reported on stderr and whatever text was gathered so far (possibly "")
  # is returned instead of raising or opening a debugger.
  def ocr
    begin
      cached = @output_dir + @rel_path
      if File.file?(cached)
        load_extracted_text(cached)
      elsif File.extname(@path) == ".pdf"
        # Decided by the real extension, not by ".pdf" appearing anywhere
        # in the path
        ocr_pdf
      else
        give_me_text
      end
    rescue StandardError => e
      warn "OCRFile: could not extract text from #{@path}: #{e.class}: #{e.message}"
    end
    @text
  end

  # Check if file is pdf
  #
  # Looks for a PDF version marker in the first 8 bytes of the file.
  # Returns true or false; an empty file is not a PDF.
  def is_pdf?
    file_start = File.open(@path, 'rb') { |f| f.read(8) }
    # read returns nil at end of file, i.e. for an empty file
    return false if file_start.nil?
    !(file_start =~ /\%PDF-\d+\.?\d+/).nil?
  end

  # Load text that is already extracted
  def load_extracted_text(file)
    @text = File.read(file)
  end

  # Send file to give me text
  #
  # Raises ExtractionError when the service reports a failure; in that case
  # the error body is not stored as the document text.
  def give_me_text
    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
    c.multipart_form_post = true
    c.http_post(Curl::PostField.file('file', @path))
    body = c.body_str.to_s
    gotten_text_ok?(body)
    @text = body
  end

  # Checks if text was successfully extracted
  #
  # Returns true when the text looks fine and raises ExtractionError when it
  # contains the service's "Stream Closed" failure marker.
  def gotten_text_ok?(text)
    if text.include?("java.io.IOException: Stream Closed")
      raise ExtractionError, "text extraction service failed for #{@path}"
    end
    true
  end

  # OCR with tesseract
  #
  # Appends the text of every page, in page order, to the collected text.
  def ocr_pdf
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => PAGES_DIR)
    filename = File.basename(@path, ".pdf")
    docs = Dir[File.join(PAGES_DIR, "#{filename}*")]
    # Extract text and save
    Docsplit.extract_text(docs, :output => TEXT_DIR)
    text_files = Dir[File.join(TEXT_DIR, "#{filename}*")]
    pages = text_files.sort_by { |f| page_number(f) }
    @text += pages.map { |f| File.read(f) }.join
  ensure
    # Clean up, also when splitting or text extraction failed part-way
    FileUtils.rm_rf([PAGES_DIR, TEXT_DIR])
  end

  private

  # Page number taken from the trailing digits of a per-page text file name
  # (e.g. "text/report_12.txt" gives 12); 0 when there are none.
  def page_number(file)
    File.basename(file, ".txt")[/\d+\z/].to_i
  end
end

data/lib/parsefile.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'json'
+require 'docsplit'
+require 'fileutils'
+require 'pry'
+require 'dircrawl'
+load 'ocrfile.rb'
+load 'extractmetadata.rb'
# Ties metadata extraction and text extraction together and renders the
# combined result for one document as pretty-printed JSON.
class ParseFile
  # file       - path of the document to parse
  # input_dir  - passed through to ExtractMetadata and OCRFile
  # output_dir - passed through to ExtractMetadata and OCRFile
  def initialize(file, input_dir, output_dir)
    @path, @input_dir, @output_dir = file, input_dir, output_dir
  end

  # Parse the file
  #
  # Gathers the metadata first, because the OCR step needs the relative path
  # it contains, then extracts the text and returns the JSON document.
  def parse_file
    @metadata = ExtractMetadata.new(@path, @input_dir, @output_dir).extract
    @text = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path]).ocr
    gen_output
  end

  # Generate output
  #
  # Returns a pretty-printed JSON string holding the full path, every
  # metadata entry and the extracted text, in that order.
  def gen_output
    record = { :full_path => @path }
    record.merge!(@metadata)
    record[:text] = @text
    JSON.pretty_generate(record)
  end
end

metadata ADDED Viewed

@@ -0,0 +1,47 @@
+--- !ruby/object:Gem::Specification
+name: parsefile
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- M. C. McGrath
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-01-28 00:00:00.000000000 Z
+dependencies: []
+description: OCR file and extract metadata
+email: shidash@shidash.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/extractmetadata.rb
+- lib/ocrfile.rb
+- lib/parsefile.rb
+homepage: https://github.com/TransparencyToolkit/parsefile
+licenses:
+- GPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: OCR file and extract metadata
+test_files: []
+has_rdoc: