RubyGems - act_as_page_extractor - Versions diffs - 0.1.0 - Mend

act_as_page_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.gitignore +57 -0
data/.rmvrc +1 -0
data/.rspec +3 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/Gemfile +22 -0
data/Gemfile.lock +107 -0
data/LICENSE +21 -0
data/README.md +119 -0
data/Rakefile +6 -0
data/act_as_page_extractor.gemspec +34 -0
data/lib/act_as_page_extractor.rb +126 -0
data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
data/lib/act_as_page_extractor/modules/interface.rb +30 -0
data/lib/act_as_page_extractor/modules/saving.rb +47 -0
data/lib/act_as_page_extractor/modules/tools.rb +54 -0
data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
data/lib/act_as_page_extractor/modules/validating.rb +22 -0
data/lib/act_as_page_extractor/version.rb +5 -0
data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
data/spec/act_as_page_extractor_spec.rb +46 -0
data/spec/spec_helper.rb +8 -0
data/spec/support/models.rb +92 -0
data/test/test-doc-3-pages.doc +0 -0
data/test/test-doc-3-pages.docx +0 -0
data/test/test-doc-3-pages.docx.7z +0 -0
data/test/test-doc-3-pages.docx.rar +0 -0
data/test/test-doc-3-pages.docx.zip +0 -0
data/test/test-doc-3-pages.html +279 -0
data/test/test-doc-3-pages.odt +0 -0
data/test/test-doc-3-pages.pdf +0 -0
data/test/test-doc-3-pages.rtf +339 -0
data/test/test-doc-3-pages.txt +125 -0
data/test/test-doc-3-pages.wrong +0 -0
metadata +279 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 5b69a8043a6f3c7c01db160cbefaac0901a095e9
+  data.tar.gz: 7f18973005aa3aa1c7ea465bc73b7d4a1fcce266
+SHA512:
+  metadata.gz: fb9b0bb9ae38c501bb7d0d729a893be689695d6eb5cec46ab629bd529da1eb70c6ce0463974cead6cc4d955da915d3c01d569597d4fe1d30ef6441efa4ff01ba
+  data.tar.gz: 1836aa3cad7d8ff323b55afd3433e8842bb03efd069f0d9d947debc54af7324faa737dc08c9d11851861dd493f17d523b771747e8884b2300b9c593c383dacfb

data/.gitignore ADDED Viewed

@@ -0,0 +1,57 @@
+*.gem
+*.rbc
+/.config
+/coverage/
+/InstalledFiles
+/pkg/
+/spec/reports/
+/spec/examples.txt
+/test/tmp/
+/test/version_tmp/
+/tmp/
+# Used by dotenv library to load environment variables.
+# .env
+## Specific to RubyMotion:
+.dat*
+.repl_history
+build/
+*.bridgesupport
+build-iPhoneOS/
+build-iPhoneSimulator/
+## Specific to RubyMotion (use of CocoaPods):
+#
+# We recommend against adding the Pods directory to your .gitignore. However
+# you should judge for yourself, the pros and cons are mentioned at:
+# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
+#
+# vendor/Pods/
+## Documentation cache and generated files:
+/.yardoc/
+/_yardoc/
+/doc/
+/rdoc/
+## Environment normalization:
+/.bundle/
+/vendor/bundle
+/lib/bundler/man/
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# Gemfile.lock
+# .ruby-version
+# .ruby-gemset
+# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+.rvmrc
+# Ignore all logfiles and tempfiles.
+/log/*
+!/log/.keep
+/public/uploads/*
+.idea/*
+test/uploads

data/.rmvrc ADDED Viewed

	@@ -0,0 +1 @@
1	+ rvm use ruby-2.3.0@act_as_page_extractor --create

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--color
+#--warnings
+--require spec_helper

data/.ruby-gemset ADDED Viewed

	@@ -0,0 +1 @@
1	+ act_as_page_extractor

data/.ruby-version ADDED Viewed

	@@ -0,0 +1 @@
1	+ ruby-2.3.0

data/Gemfile ADDED Viewed

@@ -0,0 +1,22 @@
+source 'https://rubygems.org'
+# Pull in the dependencies declared in act_as_page_extractor.gemspec
+gemspec
+# The gems below repeat the gemspec's runtime dependencies so that they are
+# also listed explicitly for local development.
+gem 'activerecord', '~> 4.1'
+gem 'awesome_print'
+gem 'docsplit'            # API for OpenOffice jodconverter (any to pdf)
+gem 'pdf_utils'           # getting text from pdf
+gem 'prawn', '~>0.7.1'    # need for pdf_utils
+gem 'pdf-reader'          # need for pdf_utils
+gem 'total_compressor'    # decompressing files
+gem 'filesize'            # pretty size of file
+gem 'byebug'
+group :test do
+  gem 'rspec', '>= 2.14'
+  # Already inside the :test group, so no extra `group:` option is needed.
+  gem 'simplecov', require: false
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,107 @@
+PATH
+  remote: .
+  specs:
+    act_as_page_extractor (0.0.1)
+      activerecord (~> 4.1)
+      awesome_print
+      docsplit
+      filesize
+      pdf-reader
+      pdf_utils
+      prawn (~> 0.7.1)
+      total_compressor
+GEM
+  remote: https://rubygems.org/
+  specs:
+    Ascii85 (1.0.2)
+    activemodel (4.2.7.1)
+      activesupport (= 4.2.7.1)
+      builder (~> 3.1)
+    activerecord (4.2.7.1)
+      activemodel (= 4.2.7.1)
+      activesupport (= 4.2.7.1)
+      arel (~> 6.0)
+    activesupport (4.2.7.1)
+      i18n (~> 0.7)
+      json (~> 1.7, >= 1.7.7)
+      minitest (~> 5.1)
+      thread_safe (~> 0.3, >= 0.3.4)
+      tzinfo (~> 1.1)
+    afm (0.2.2)
+    arel (6.0.3)
+    awesome_print (1.7.0)
+    builder (3.2.2)
+    byebug (9.0.5)
+    diff-lcs (1.2.5)
+    docile (1.1.5)
+    docsplit (0.7.6)
+    filesize (0.1.1)
+    hashery (2.1.2)
+    i18n (0.7.0)
+    json (1.8.3)
+    minitest (5.9.0)
+    pdf-reader (1.4.0)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
+    pdf_utils (0.1.0)
+    prawn (0.7.2)
+      prawn-core (>= 0.7.2, < 0.8)
+      prawn-layout (>= 0.7.2, < 0.8)
+      prawn-security (>= 0.7.1, < 0.8)
+    prawn-core (0.7.2)
+    prawn-layout (0.7.2)
+    prawn-security (0.7.1)
+    rake (11.2.2)
+    rspec (3.5.0)
+      rspec-core (~> 3.5.0)
+      rspec-expectations (~> 3.5.0)
+      rspec-mocks (~> 3.5.0)
+    rspec-core (3.5.3)
+      rspec-support (~> 3.5.0)
+    rspec-expectations (3.5.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.5.0)
+    rspec-mocks (3.5.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.5.0)
+    rspec-support (3.5.0)
+    ruby-rc4 (0.1.5)
+    rubyzip (0.9.9)
+    simplecov (0.11.2)
+      docile (~> 1.1.0)
+      json (~> 1.8)
+      simplecov-html (~> 0.10.0)
+    simplecov-html (0.10.0)
+    thread_safe (0.3.5)
+    total_compressor (0.1.6)
+      awesome_print
+      rubyzip (~> 0.9.9)
+    ttfunk (1.4.0)
+    tzinfo (1.2.2)
+      thread_safe (~> 0.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  act_as_page_extractor!
+  activerecord (~> 4.1)
+  awesome_print
+  bundler (~> 1.3)
+  byebug
+  docsplit
+  filesize
+  pdf-reader
+  pdf_utils
+  prawn (~> 0.7.1)
+  rake
+  rspec (>= 2.14)
+  simplecov
+  total_compressor
+BUNDLED WITH
+   1.13.7

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2017 Flower Team
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,119 @@
+act_as_page_extractor
+================
+Library for extracting plain text from documents(files) for further processing (indexing and searching).
+## Installation
+Install appropriate tools before using:
+    sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
+Add this line to your application's Gemfile:
+    gem 'act_as_page_extractor'
+## Usage
+For example, for the Document model we need to execute:
+    $ bundle
+    $ rails g act_as_page_extractor:migration Document category_id user_id
+As a result we get two migration files:
+    class AddPageExtractorFields < ActiveRecord::Migration
+      def change
+        add_column :documents, :page_extraction_state, :string, default: ''
+        add_column :documents, :page_extraction_pages, :integer, default: 0
+        add_column :documents, :page_extraction_doctype, :string, default: ''
+        add_column :documents, :page_extraction_filesize, :string, default: ''
+      end
+    end
+    class CreateExtractedPages < ActiveRecord::Migration
+      def change
+        create_table :extracted_pages do |t|
+          t.text :page
+          t.integer :document_id
+          t.integer :category_id
+          t.integer :user_id
+          t.integer :page_number
+          t.timestamps null: false
+        end
+        add_index :extracted_pages, :document_id
+        add_index :extracted_pages, :category_id
+        add_index :extracted_pages, [:document_id, :category_id]
+        add_index :extracted_pages, [:document_id, :page_number]
+      end
+    end
+The Document model must have a field which contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
+Add to model next parameters for initializing:
+        class Document < ActiveRecord::Base
+          include ActAsPageExtractor
+          act_as_page_extractor options: {
+            document_class:    'Document',
+            save_as_pdf:       true,
+            filename:          :filename,
+            document_id:       :document_id,
+            additional_fields: [:category_id, :user_id],
+            #file_storage:      "/full/path/to/documents/storage",
+            #pdf_storage:       "/full/path/to/extracted/pdf/storage"
+          }
+          has_many :extracted_pages, dependent: :destroy
+      end
+Now our instance has a few new methods:
+    document = Document.first
+    document.page_extract!
+    document.extracted_pages
+    document.pdf_path # if option 'save_as_pdf' is 'true'
+    # Access to pages
+    ExtractedPage.count
+    # Importing whole directory of documents
+    ActAsPageExtractor.import_files('/path/to/folder/with/documents')
+    # We can use cron to run the processing of all the new documents
+    ActAsPageExtractor.start_extraction
+    # Getting statistics information of all documents
+    ActAsPageExtractor.statistics
+Parameters of initializing `act_as_page_extractor options: { ... }`:
+`document_class` - name of the model (e.g. 'Document')
+`save_as_pdf` - boolean [true, false], whether we want to save the temporary pdf
+`filename` - name of the field which gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. CarrierWave object with 'filename.url')
+`document_id` - name for saving id
+`additional_fields` - additional fields that are added to the extracted page (e.g. for indexing, etc.)
+`file_storage` - path for saving tmp files (by default it is "public")
+`pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
+## Run tests
+    $ COVERAGE=true rspec
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+## Contacts
+https://github.com/phlowerteam
+phlowerteam@gmail.com
+## License
+Copyright (c) 2017 PhlowerTeam
+MIT License

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+# Gem packaging tasks supplied by Bundler.
+require 'bundler/gem_tasks'
+# RSpec's Rake integration.
+require 'rspec/core/rake_task'
+# Register the `spec` task.
+RSpec::Core::RakeTask.new(:spec)
+# A bare `rake` invocation runs the `spec` task.
+task :default => :spec

data/act_as_page_extractor.gemspec ADDED Viewed

@@ -0,0 +1,34 @@
+# coding: utf-8
+# Gem specification for act_as_page_extractor.
+# Put the gem's lib directory on the load path so the version file can be required.
+lib_path = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
+require 'act_as_page_extractor/version'
+Gem::Specification.new do |s|
+  # Identity and authorship
+  s.name          = 'act_as_page_extractor'
+  s.version       = ActAsPageExtractor::VERSION
+  s.authors       = ['PhlowerTeam']
+  s.email         = ['phlowerteam@gmail.com']
+  s.description   = 'Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice'
+  s.summary       = 'Uses system calls'
+  s.homepage      = 'https://github.com/phlowerteam'
+  s.license       = 'MIT'
+  # Packaged files are whatever git tracks; test files are picked out by directory.
+  tracked_files   = `git ls-files`.split($/)
+  s.files         = tracked_files
+  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ['lib']
+  # Development-only dependencies
+  s.add_development_dependency 'bundler', '~> 1.3'
+  %w[rake byebug rspec simplecov].each do |dev_gem|
+    s.add_development_dependency dev_gem
+  end
+  # Runtime dependencies
+  s.add_runtime_dependency 'activerecord', '~> 4.1'
+  s.add_runtime_dependency 'awesome_print'
+  s.add_runtime_dependency 'docsplit'            # API for OpenOffice jodconverter (any to pdf)
+  s.add_runtime_dependency 'pdf_utils'           # getting text from pdf
+  s.add_runtime_dependency 'prawn', '~>0.7.1'    # need for pdf_utils
+  s.add_runtime_dependency 'pdf-reader'          # need for pdf_utils
+  s.add_runtime_dependency 'total_compressor'    # decompressing files
+  s.add_runtime_dependency 'filesize'            # pretty size of file
+end

data/lib/act_as_page_extractor.rb ADDED Viewed

@@ -0,0 +1,126 @@
+require 'act_as_page_extractor/version'
+require 'active_record'
+require 'awesome_print'
+require 'filesize'
+require 'total_compressor'
+require 'docsplit'
+require 'pdf_utils'
+require 'prawn'
+require 'pdf-reader'
+require 'act_as_page_extractor/modules/tools.rb'
+require 'act_as_page_extractor/modules/validating.rb'
+require 'act_as_page_extractor/modules/unzipping.rb'
+require 'act_as_page_extractor/modules/extracting.rb'
+require 'act_as_page_extractor/modules/saving.rb'
+require 'act_as_page_extractor/modules/interface'
+# Mixin that adds page-extraction behaviour to an ActiveRecord model.
+#
+# Including the module registers the callbacks below and exposes the
+# act_as_page_extractor class macro, which configures the per-model options.
+# The helpers cleanup_pages, unzip_document, valid_document, extract_pages,
+# save_to_db, update_state, save_pdf, debug_info and remove_files are not
+# defined in this file; presumably they come from the files required from
+# act_as_page_extractor/modules.
+module ActAsPageExtractor
+  extend ActiveSupport::Concern
+  included do
+    # Every new record starts in the 'new' extraction state.
+    before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
+    before_destroy :remove_files
+  end
+  module ClassMethods
+    # Class macro that configures extraction for the including model.
+    #
+    # options - Hash whose keys are read as follows:
+    #   :document_class    - String name of the model class (resolved with Object.const_get)
+    #   :save_as_pdf       - returned as-is by #save_as_pdf
+    #   :filename          - name of the method that returns the stored file object
+    #   :document_id       - returned as-is by #extracted_document_id
+    #   :additional_fields - defaults to []
+    #   :file_storage      - defaults to FILE_STORAGE
+    #   :pdf_storage       - defaults to PDF_STORAGE
+    #
+    # extracted_filename and document_class are also (re)defined as singleton
+    # methods on ActAsPageExtractor itself, so for those two the most recent
+    # call to this macro wins.
+    def act_as_page_extractor(options: {})
+      define_method(:save_as_pdf) { |*_args| options[:save_as_pdf] }
+      define_method(:extracted_filename) { |*_args| self.send(options[:filename].to_sym) }
+      ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
+      ActAsPageExtractor.define_singleton_method(:document_class) { |*_args| Object.const_get(options[:document_class]) }
+      define_method(:extracted_document_id) { |*_args| options[:document_id] }
+      define_method(:additional_fields) { |*_args| options[:additional_fields] || [] }
+      define_method(:file_storage) { |*_args| options[:file_storage] || FILE_STORAGE }
+      define_method(:pdf_storage) { |*_args| options[:pdf_storage] || PDF_STORAGE }
+    end
+  end
+  # Values stored in the model's page_extraction_state column.
+  EXTRACTING_STATES = {
+    new: 'new',
+    extracting: 'extracting',
+    extracted: 'extracted',
+    'error.extraction': 'error.extraction'
+  }.freeze
+  # Default locations. Dir.pwd is evaluated once, when this file is loaded.
+  TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
+  FILE_STORAGE = "#{Dir.pwd}/public".freeze
+  PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
+  # Prepares the record for extraction; currently this only makes sure the
+  # pdf output directory exists (when save_as_pdf is enabled).
+  def initialized
+    create_pdf_dir
+  end
+  # Runs the extraction pipeline for this record: prepares directories, copies
+  # and unpacks the source file into a temporary directory and, if the
+  # document is valid, extracts its pages and saves them.
+  #
+  # The ensure section runs even when a step raises, so the state update,
+  # pdf saving, debug output and temporary-directory cleanup are always
+  # attempted.
+  def page_extract!
+    initialized
+    cleanup_pages
+    create_tmp_dir
+    begin
+      copy_document
+      unzip_document
+      if valid_document
+        extract_pages
+        save_to_db
+      end
+    ensure
+      update_state
+      save_pdf
+      debug_info
+      finish
+    end
+  end
+  private
+  # Creates the pdf storage directory when pdf saving is enabled.
+  def create_pdf_dir
+    return unless save_as_pdf
+    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
+    FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
+  end
+  # Creates a uniquely named working directory and remembers it in @tmp_dir.
+  def create_tmp_dir
+    @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
+    FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
+  end
+  # Copies the source document into @tmp_dir and records the original path,
+  # the path of the copy and the bare file name in instance variables.
+  def copy_document
+    @origin_document_path = "#{file_storage}#{extracted_filename.url}"
+    FileUtils.cp(@origin_document_path, @tmp_dir)
+    @document_filename = @origin_document_path.split('/').last
+    @copy_document_path = "#{@tmp_dir}/#{@document_filename}"
+  end
+  # Final step of page_extract!: removes the temporary working directory.
+  def finish
+    remove_tmp_dir
+  end
+  # Deletes @tmp_dir recursively. The '/tmp/' check is a safety guard so that
+  # only paths containing a tmp directory segment are ever removed.
+  def remove_tmp_dir
+    FileUtils.rm_rf(@tmp_dir) if @tmp_dir =~ %r{/tmp/}
+  end
+end
+# rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
+# Rails 4 way
+# 9.2.7.1 Multiple Callback Methods in One Class
+# 258 page
+# class ActiveRecord::Base
+#   def self.acts_as_page_extractor(document_field=:filename)
+#     auditor = Auditor.new(audit_log)
+#     after_create auditor
+#     after_update auditor
+#     after_destroy auditor
+#   end
+# end