RubyGems - act_as_page_extractor - Versions diffs - 0.1.0 - Mend

act_as_page_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.gitignore +57 -0
data/.rmvrc +1 -0
data/.rspec +3 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/Gemfile +22 -0
data/Gemfile.lock +107 -0
data/LICENSE +21 -0
data/README.md +119 -0
data/Rakefile +6 -0
data/act_as_page_extractor.gemspec +34 -0
data/lib/act_as_page_extractor.rb +126 -0
data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
data/lib/act_as_page_extractor/modules/interface.rb +30 -0
data/lib/act_as_page_extractor/modules/saving.rb +47 -0
data/lib/act_as_page_extractor/modules/tools.rb +54 -0
data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
data/lib/act_as_page_extractor/modules/validating.rb +22 -0
data/lib/act_as_page_extractor/version.rb +5 -0
data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
data/spec/act_as_page_extractor_spec.rb +46 -0
data/spec/spec_helper.rb +8 -0
data/spec/support/models.rb +92 -0
data/test/test-doc-3-pages.doc +0 -0
data/test/test-doc-3-pages.docx +0 -0
data/test/test-doc-3-pages.docx.7z +0 -0
data/test/test-doc-3-pages.docx.rar +0 -0
data/test/test-doc-3-pages.docx.zip +0 -0
data/test/test-doc-3-pages.html +279 -0
data/test/test-doc-3-pages.odt +0 -0
data/test/test-doc-3-pages.pdf +0 -0
data/test/test-doc-3-pages.rtf +339 -0
data/test/test-doc-3-pages.txt +125 -0
data/test/test-doc-3-pages.wrong +0 -0
metadata +279 -0

data/lib/act_as_page_extractor/modules/extracting.rb ADDED Viewed

@@ -0,0 +1,35 @@
+module ActAsPageExtractor
+  def extract_pages
+    convert_to_pdf
+    convert_to_text
+  end
+  def convert_to_pdf
+     @pdf_path = if 'pdf' == @document_path.split('.').last.downcase
+       @document_path
+     else
+      if timeout_wrapper{ Docsplit.extract_pdf(@document_path, output: @tmp_dir)}
+        pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
+        pdf_path if File.exists?(pdf_path)
+      end
+    end
+  end
+  def convert_to_text
+    begin
+      @pdf_pages = PdfUtils.info(@pdf_path).pages
+      if @pdf_pages
+        if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
+        else
+          # :nocov:
+          @pdf_pages = nil
+          raise
+          # :nocov:
+        end
+      end
+    # :nocov:
+    rescue
+    end
+    # :nocov:
+  end
+end

data/lib/act_as_page_extractor/modules/interface.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module ActAsPageExtractor
+  def origin_file_name
+    self.send(:extracted_filename).url.to_s.split('/').last
+  end
+  def pdf_path
+    if page_extraction_state == EXTRACTING_STATES[:extracted] && page_extraction_doctype&.downcase != 'pdf'
+      "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
+    end
+  end
+  def remove_files
+    FileUtils::rm_rf(pdf_path) if File.exists?(pdf_path.to_s)
+  end
+  def self.start_extraction
+    document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
+  end
+  def self.statistics
+    totals_documents = document_class.count
+    supported_documents = document_class.where("page_extraction_doctype ILIKE ANY (array[#{VALIDATE_DOC_TYPES.map{|dt| '\'%'+dt+'%\''}.join(',')}])").count
+    {
+      total: totals_documents,
+      supported_documents: supported_documents,
+      unsupported_documents: totals_documents - supported_documents,
+      states: EXTRACTING_STATES.map{|state, value| [ state, document_class.where(page_extraction_state: value).count] }.to_h,
+    }
+  end
+end

data/lib/act_as_page_extractor/modules/saving.rb ADDED Viewed

@@ -0,0 +1,47 @@
+module ActAsPageExtractor
+  def save_pdf
+    if save_as_pdf &&
+       is_extracted &&
+       @document_path.split('.').last&.downcase != 'pdf'
+      if @pdf_path
+        FileUtils.cp(@pdf_path, pdf_storage)
+      end
+    end
+  end
+  def save_to_db
+    self.update_attributes(page_extraction_state: EXTRACTING_STATES[:extracting])
+    ExtractedPage.transaction do
+      @pdf_pages&.times&.each do |pdf_page|
+        page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{(pdf_page + 1).to_s}.txt"
+        remove_last_byte(page_filename)
+        content = IO.read(page_filename).delete("<" ">" "&" "\u0001" "\u25A0" "\a")
+        page_attributes = {
+          page:        content,
+          page_number: pdf_page + 1
+        }
+        page_attributes[extracted_document_id] = self.id
+        additional_fields.each do |additional_field|
+          page_attributes[additional_field] = self.send(additional_field.to_sym)
+        end
+        ExtractedPage.create(page_attributes)
+      end
+    end
+  end
+  #fix for openoffice/jodconverter: delete last ugly byte in converted text page
+  def remove_last_byte(file_name)
+    file = File.new(file_name, 'a+')
+    if file.size > 0
+      file.seek(file.size - 1)
+      last_byte = file.getc
+      file.truncate(file.size - 1) if last_byte == "\f"
+    end
+    file.close
+  end
+end

data/lib/act_as_page_extractor/modules/tools.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module ActAsPageExtractor
+  def timeout_wrapper
+    result = nil
+    begin
+      result = Timeout::timeout(60*5) { yield }
+    rescue
+    # :nocov:
+    ensure
+    # :nocov:
+      result
+    end
+  end
+  def is_extracted
+    @pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
+  end
+  def update_state
+    updated_attributes = if is_extracted
+      {
+        page_extraction_state: EXTRACTING_STATES[:extracted],
+        page_extraction_pages: @pdf_pages
+      }
+    else
+      {
+        page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
+        page_extraction_pages: 0
+      }
+    end.merge({
+        page_extraction_doctype: @document_path&.split('.')&.last,
+        page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
+      })
+    self.update_attributes(updated_attributes)
+  end
+  def cleanup_pages
+    self.extracted_pages.destroy_all
+  end
+  # :nocov:
+  def debug_info
+    # ap "@tmp_dir"
+    # ap @tmp_dir
+    # ap "@copy_document_path"
+    # ap @copy_document_path
+    # ap "@document_path"
+    ap @document_path
+    # ap "@pdf_path"
+    # ap @pdf_path
+    # ap "@pdf_pages"
+    ap @pdf_pages
+  end
+  # :nocov:
+end

data/lib/act_as_page_extractor/modules/unzipping.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module ActAsPageExtractor
+ def unzip_document
+    @document_path = @copy_document_path
+    if validate_compress_types
+      result = TotalCompressor.decompress(@copy_document_path)
+      if result[:success] && result[:files].length == 1
+        origin_document_name = @origin_document_path.split("/").last.split('.').first
+        unpacked_document = result[:files].first.split('/').last
+        unpacked_document_format = unpacked_document.split('.').last
+        @document_path = "#{@tmp_dir}/#{origin_document_name}.#{unpacked_document_format}"
+        File.rename(result[:files].first, @document_path)
+      end
+    end
+  end
+end

data/lib/act_as_page_extractor/modules/validating.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module ActAsPageExtractor
+  VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
+  VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
+                        'rtf', 'odt', 'htm', 'html'].freeze
+  def valid_document
+    validate_size && validate_doc_types
+  end
+  def validate_size
+    mb = 2**20
+    File.size(@copy_document_path) <= 1*mb
+  end
+  def validate_compress_types
+    VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
+  end
+  def validate_doc_types
+    VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
+  end
+end

data/lib/act_as_page_extractor/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# :nocov:
+module ActAsPageExtractor
+  # Version string of the gem.
+  VERSION = "0.1.0"
+end
+# :nocov:

data/lib/generators/act_as_page_extractor/migration_generator.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# :nocov:
+require 'rails/generators/active_record'
+require 'rails/generators/base'
+module ActAsPageExtractor
+  module Generators # :nodoc:
+    class MigrationGenerator < Rails::Generators::Base # :nodoc:
+      include Rails::Generators::Migration
+      argument :document_class, type: :string, default: 'Document'
+      argument :additional_fields, type: :array, default: []
+      def self.default_generator_root
+        File.dirname(__FILE__)
+      end
+      def create_migration_file
+        migration_template 'create_extracted_pages_table.rb.erb', "db/migrate/create_#{page_extractor_table_name}.rb"
+        migration_template 'add_page_extractor_fields_to_documents.rb.erb', "db/migrate/add_page_extractor_fields_to_#{documents_table_name}.rb"
+        template "extracted_page.rb.erb", "app/models/extracted_page.rb"
+        template "act_as_page_extractor.rb.erb", "config/initializers/act_as_page_extractor.rb"
+      end
+      private
+      def page_extractor_table_name
+        'extracted_pages'
+      end
+      def migration_class_name_page_extractor
+        "Create#{page_extractor_table_name.camelize}"
+      end
+      def documents_table_name
+        document_class.underscore.pluralize
+      end
+      def migration_class_name_documents
+        "AddPageExtractorFieldsTo#{document_class.pluralize}"
+      end
+      def self.next_migration_number(dirname)
+        ActiveRecord::Generators::Base.next_migration_number(dirname)
+      end
+    end
+  end
+end
+# :nocov:

data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb ADDED Viewed

@@ -0,0 +1,14 @@
+module ActAsPageExtractor
+  def self.import_files(directory: nil)
+    return unless directory
+    Dir["#{directory}/*"].each do |fname|
+      ap fname
+      document = <%=document_class%>.new(
+        name: fname.split('/').last
+      )
+      document.send("#{extracted_filename}=".to_sym, File.open(fname))
+      document.save!
+    end
+  end
+end

data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb ADDED Viewed

@@ -0,0 +1,8 @@
+# Adds the four page-extraction bookkeeping columns (state, page count,
+# doctype, file size) to the documents table.
+class <%= migration_class_name_documents %> < ActiveRecord::Migration
+  def change
+    add_column :<%= documents_table_name %>, :page_extraction_state, :string, default: ''
+    add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
+    add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
+    add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
+  end
+end

data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb ADDED Viewed

@@ -0,0 +1,19 @@
+# Creates the table that holds one row per extracted page, with a required
+# reference to the owning document, one integer column per additional field,
+# and indexes on the document id alone and combined with each additional
+# field and with the page number.
+class <%= migration_class_name_page_extractor %> < ActiveRecord::Migration
+  def change
+    create_table :<%= page_extractor_table_name %> do |t|
+      t.text :page
+      t.integer :<%= documents_table_name.singularize %>_id, null: false
+      <% additional_fields.each do |field|%>
+      t.integer :<%= field %><%end%>
+      t.integer :page_number
+      t.timestamps
+    end
+    add_index :<%= page_extractor_table_name %>, :<%= documents_table_name.singularize %>_id
+    <% additional_fields.each do |field|%>
+    add_index :<%= page_extractor_table_name %>, :<%= field %><%end%>
+    <% additional_fields.each do |field|%>
+    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :<%= field %>]<%end%>
+    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :page_number]
+  end
+end

data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb ADDED Viewed

@@ -0,0 +1,3 @@
+# One extracted page of text; belongs to the document it was extracted from.
+class ExtractedPage < ActiveRecord::Base
+  belongs_to :<%= documents_table_name.singularize %>
+end

data/spec/act_as_page_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'spec_helper'
+require 'act_as_page_extractor'
+describe ActAsPageExtractor do
+  context 'correct extraction' do
+    [
+      'test-doc-3-pages.docx',
+      'test-doc-3-pages.doc',
+      'test-doc-3-pages.pdf',
+      'test-doc-3-pages.rtf',
+      'test-doc-3-pages.odt',
+      'test-doc-3-pages.html',
+      'test-doc-3-pages.txt',
+      'test-doc-3-pages.docx.zip',
+      'test-doc-3-pages.docx.rar',
+      'test-doc-3-pages.docx.7z'
+    ].each do |document|
+      it "extraction valid document #{document}" do
+        book = Book.new({doc_path: document})
+        allow(Book).to receive_message_chain('where') { [book] }
+        ActAsPageExtractor.start_extraction
+        expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:extracted]
+        expect(ExtractedPage.array.count).to eq 3
+        expect(ExtractedPage.array[0][:page]).to match /require \'act_as_page_extractor\/modules\/interface\'/
+        unless document.match /pdf/
+          expect(book.pdf_path).to match /pdf/
+          expect(book.remove_files.count).to eq 1
+        end
+        expect(ActAsPageExtractor.statistics).to include(supported_documents:  1)
+      end
+    end
+  end
+  context 'incorrect extraction' do
+    [
+      'test-doc-3-pages.wrong',
+    ].each do |document|
+      it "extraction invalid document #{document}" do
+        book = Book.new({doc_path: document})
+        allow(Book).to receive_message_chain('where') { [book] }
+        ActAsPageExtractor.start_extraction
+        expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction']
+      end
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,8 @@
+# Coverage is opt-in: SimpleCov is only loaded and started when the COVERAGE
+# environment variable is set.
+if ENV['COVERAGE']
+  require 'simplecov'
+  SimpleCov.start 'rails'
+end
+require 'rspec'
+require 'support/models'
+require 'act_as_page_extractor'
+require 'byebug'

data/spec/support/models.rb ADDED Viewed

@@ -0,0 +1,92 @@
+require 'act_as_page_extractor'
+class Filename
+  attr_accessor :url
+  def initialize(params)
+    @url = params[:url]
+  end
+end
+class Book
+  cattr_accessor :id,
+                :category_id,
+                :user_id,
+                :page_extraction_state,
+                :page_extraction_pages,
+                :page_extraction_doctype,
+                :page_extraction_filesize
+  def self.before_create &block
+    yield
+  end
+  def self.before_destroy *args
+  end
+  def self.count
+    1
+  end
+  include ActAsPageExtractor
+  act_as_page_extractor options: {
+    document_class:    'Book',
+    save_as_pdf:       true,
+    filename:          :filename, # CarrierWave class with 'filename.url' method
+    document_id:       :document_id,
+    additional_fields: [:category_id, :user_id],
+    file_storage:      "#{Dir.pwd}/test/",
+    pdf_storage:       "#{Dir.pwd}/test/uploads/extracted/pdf"
+  }
+  def initialize(params)
+    @doc_path = params[:doc_path]
+    @id = @category_id = @user_id = nil
+    @page_extraction_state = @page_extraction_pages = nil
+    @page_extraction_doctype = @page_extraction_filesize = nil
+    ExtractedPage.cleanup
+  end
+  def filename
+    Filename.new(url: @doc_path)
+  end
+  def extracted_pages
+    array ||= ExtractedPage.array
+    def array.destroy_all
+    end
+    array
+  end
+  def update_attributes params
+    params.each do |key, value|
+      instance_eval("self.#{key} = #{value.class == String ? '\'' + value + '\'': value }")
+    end
+  end
+end
+class ExtractedPage
+  attr_accessor :id, :page, :document_id, :category_id, :page_number, :user_id
+  def document
+  end
+  def self.transaction &block
+    yield
+  end
+  def self.create params
+    @@array ||= []
+    @@array << params
+  end
+  def self.array
+    @@array ||= []
+  end
+  def self.cleanup
+    @@array = []
+  end
+end