RubyGems - parsefile - Versions diffs - 0.0.1 - Mend

parsefile 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 565694f0c2197d0c77b3b6e96e4da03ee02930b5
+  data.tar.gz: c008fd5f1e9ae074e79bacb8aabb4e2b8ef3755d
+SHA512:
+  metadata.gz: 27fb72e4c92ebf5a6ab0b21f47a2278067af42a7572622c5cbb2320677235017d2bca030dcb415c1d50820db1f0009e8dcc7f7f06fcb8b311d7d8e973d50a79a
+  data.tar.gz: 4e83e5644beb7ca4e073f7e5ef07e8ec697016d242f14462bf7eddf54a9fab6c75750fdaa934c5936594327f1317cd23477a9d9a76723408c648ad2ebb94c0b0

data/lib/extractmetadata.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'json'
+require 'docsplit'
+class ExtractMetadata
+  def initialize(file, input_dir, output_dir)
+    @path = file
+    @input_dir = input_dir
+    @output_dir = output_dir
+  end
+  # Extract metadata
+  def extract
+    outhash = Hash.new
+    # Get relative path
+    @rel_path = get_rel_path
+    outhash[:rel_path] = @rel_path
+    # Get formatted name and file type
+    outhash[:formatted_name] = get_formatted_name
+    outhash[:filetype] = get_file_type
+    # Extract file metadata, merge. and return
+    outhash.merge!(extract_file_metadata)
+    return outhash
+  end
+  # Get the relative path
+  def get_rel_path
+    @path.gsub(@input_dir, "")
+  end
+  # Get a formatted file name
+  def get_formatted_name
+    @rel_path.split(".").first.gsub("_", " ").gsub("/", "")
+  end
+  # Get file type
+  def get_file_type
+    @rel_path.split(".").last
+  end
+  # Extract PDF metadata
+  def extract_file_metadata
+    metadata = Hash.new
+    metadata[:author] = Docsplit.extract_author(@path)
+    metadata[:creator] =  Docsplit.extract_creator(@path)
+    metadata[:producer] = Docsplit.extract_producer(@path)
+    metadata[:title] = Docsplit.extract_title(@path)
+    metadata[:subject] = Docsplit.extract_subject(@path)
+    metadata[:date] = Docsplit.extract_date(@path)
+    metadata[:keywords] = Docsplit.extract_keywords(@path)
+    metadata[:length] = Docsplit.extract_length(@path)
+    return metadata
+  end
+end

data/lib/ocrfile.rb ADDED Viewed

@@ -0,0 +1,77 @@
+require 'fileutils'
+require 'docsplit'
+require 'curb'
+class OCRFile
+  def initialize(file, input_dir, output_dir, rel_path)
+    @path = file
+    @input_dir = input_dir
+    @output_dir = output_dir
+    @rel_path = rel_path
+    @text = ""
+  end
+  # OCR file
+  def ocr
+    begin
+      if File.exist?(@output_dir+@rel_path)
+        load_extracted_text(@output_dir+@rel_path)
+      elsif @path.include?(".pdf")
+        ocr_pdf
+      else
+        give_me_text
+      end
+    rescue # Detect errors
+      binding.pry
+    end
+    return @text
+  end
+  # Check if file is pdf
+  def is_pdf?
+    file_start = File.open(@path, 'r') { |f| f.read(8)}
+    file_start.match(/\%PDF-\d+\.?\d+/)
+  end
+  # Load text that is already extracted
+  def load_extracted_text(file)
+    @text = File.read(file)
+  end
+  # Send file to give me text
+  def give_me_text
+    c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
+    c.multipart_form_post = true
+    c.http_post(Curl::PostField.file('file', @path))
+    @text = c.body_str
+    gotten_text_ok?(@text)
+  end
+  # Checks if text was successfully extracted
+  def gotten_text_ok?(text)
+    throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
+  end
+  # OCR with tesseract
+  def ocr_pdf
+    # Split pages to handle large PDFs
+    Docsplit.extract_pages(@path, :output => 'pages')
+    filename = @path.split("/").last.gsub(".pdf", "")
+    docs = Dir['pages/'+filename+'*']
+    # Extract text and save
+    Docsplit.extract_text(docs, :output => 'text')
+    text_files = Dir['text/'+filename+'*']
+    sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
+    sorted_text.each do |f|
+      @text += File.read(f)
+    end
+    # Clean up
+    FileUtils.rm_f Dir.glob("pages/*")
+    Dir.delete("pages")
+    FileUtils.rm_f Dir.glob("text/*")
+    Dir.delete("text")
+  end
+end

data/lib/parsefile.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'json'
+require 'docsplit'
+require 'fileutils'
+require 'pry'
+require 'dircrawl'
+load 'ocrfile.rb'
+load 'extractmetadata.rb'
+class ParseFile
+  def initialize(file, input_dir, output_dir)
+    @path = file
+    @input_dir = input_dir
+    @output_dir = output_dir
+  end
+  # Parse the file
+  def parse_file
+    # Get metadata
+    m = ExtractMetadata.new(@path, @input_dir, @output_dir)
+    @metadata = m.extract
+    # OCR File
+    o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path])
+    @text = o.ocr
+    # Generate output and return
+    gen_output
+  end
+  # Generate output
+  def gen_output
+    outhash = Hash.new
+    outhash[:full_path] = @path
+    outhash.merge!(@metadata)
+    outhash[:text] = @text
+    return JSON.pretty_generate(outhash)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,47 @@
+--- !ruby/object:Gem::Specification
+name: parsefile
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- M. C. McGrath
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-01-28 00:00:00.000000000 Z
+dependencies: []
+description: OCR file and extract metadata
+email: shidash@shidash.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/extractmetadata.rb
+- lib/ocrfile.rb
+- lib/parsefile.rb
+homepage: https://github.com/TransparencyToolkit/parsefile
+licenses:
+- GPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: OCR file and extract metadata
+test_files: []
+has_rdoc: