act_as_page_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
@@ -0,0 +1,35 @@
module ActAsPageExtractor
  # Runs the two extraction stages in order: obtain a PDF for the document,
  # then split that PDF into per-page text files.
  def extract_pages
    convert_to_pdf
    convert_to_text
  end

  # Sets @pdf_path to the PDF representing @document_path.
  #
  # A document whose extension is already "pdf" (case-insensitive) is used
  # as-is. Anything else is passed to Docsplit.extract_pdf with @tmp_dir as
  # output, bounded by timeout_wrapper. @pdf_path ends up nil when the
  # conversion returns a falsy value or the expected file does not exist.
  #
  # @return [String, nil] the value assigned to @pdf_path
  def convert_to_pdf
    path_parts = @document_path.split('.')

    @pdf_path =
      if path_parts.last.downcase == 'pdf'
        @document_path
      elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
        # Expected result: the document path with its last extension replaced
        # by "pdf". Named converted_path so it does not shadow #pdf_path.
        converted_path = (path_parts[0..-2] + ['pdf']).join('.')
        # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
        converted_path if File.exist?(converted_path)
      end
  end

  # Reads the page count of @pdf_path into @pdf_pages and asks Docsplit to
  # write one text file per page into @tmp_dir.
  #
  # Best effort by design: any StandardError is swallowed and leaves
  # @pdf_pages nil, which the rest of the pipeline treats as "not extracted".
  #
  # @return [nil]
  def convert_to_text
    @pdf_pages = PdfUtils.info(@pdf_path).pages
    return unless @pdf_pages

    text_extracted = timeout_wrapper do
      Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
    end
    # :nocov:
    @pdf_pages = nil unless text_extracted
    # :nocov:
  # :nocov:
  rescue StandardError
    # Never keep a page count from an earlier run when this one failed.
    @pdf_pages = nil
  # :nocov:
  end
end
@@ -0,0 +1,30 @@
module ActAsPageExtractor
  # Last path segment of the URL exposed by the object that
  # #extracted_filename returns.
  #
  # @return [String, nil]
  def origin_file_name
    send(:extracted_filename).url.to_s.split('/').last
  end

  # Location of the stored PDF copy of this document.
  #
  # Only defined for documents in the :extracted state whose recorded doctype
  # is not "pdf"; every other document yields nil.
  #
  # @return [String, nil]
  def pdf_path
    return unless page_extraction_state == EXTRACTING_STATES[:extracted]
    return if page_extraction_doctype&.downcase == 'pdf'

    "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
  end

  # Deletes the stored PDF copy when it exists.
  #
  # @return [Array<String>, nil] what FileUtils.rm_rf returns, or nil when
  #   there was nothing to delete
  def remove_files
    stored_pdf = pdf_path
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    FileUtils.rm_rf(stored_pdf) if stored_pdf && File.exist?(stored_pdf)
  end

  # Calls page_extract! on every document still in the :new state.
  def self.start_extraction
    document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
  end

  # Counts documents overall, by supported doctype and by extraction state.
  #
  # The ILIKE patterns are built from the VALIDATE_DOC_TYPES constant only,
  # so no caller-supplied text ends up in the SQL fragment.
  #
  # @return [Hash] :total, :supported_documents, :unsupported_documents and
  #   :states (state name => count)
  def self.statistics
    total = document_class.count
    like_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
    supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])").count
    per_state = EXTRACTING_STATES.map do |state, value|
      [state, document_class.where(page_extraction_state: value).count]
    end

    {
      total: total,
      supported_documents: supported,
      unsupported_documents: total - supported,
      states: per_state.to_h
    }
  end
end
@@ -0,0 +1,47 @@
module ActAsPageExtractor
  # Copies the converted PDF into pdf_storage.
  #
  # Nothing is copied unless saving is enabled, the extraction succeeded, the
  # source document is not itself a PDF and a converted PDF path is known.
  def save_pdf
    return unless save_as_pdf && is_extracted
    return if @document_path.split('.').last&.downcase == 'pdf'
    return unless @pdf_path

    FileUtils.cp(@pdf_path, pdf_storage)
  end

  # Marks the document as :extracting and creates one ExtractedPage per page
  # text file found in @tmp_dir, all inside a single transaction.
  #
  # Page files are expected at "<tmp_dir>/<document base name>_<n>.txt" with
  # n starting at 1. Does nothing beyond the state update when @pdf_pages is
  # nil.
  def save_to_db
    update_attributes(page_extraction_state: EXTRACTING_STATES[:extracting])

    ExtractedPage.transaction do
      @pdf_pages&.times do |page_index|
        page_number = page_index + 1
        page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
        remove_last_byte(page_filename)
        # Strip markup-significant and control characters from the page text.
        content = File.read(page_filename).delete("<>&\u0001\u25A0\a")

        page_attributes = { page: content, page_number: page_number }
        page_attributes[extracted_document_id] = id
        additional_fields.each do |additional_field|
          page_attributes[additional_field] = send(additional_field.to_sym)
        end

        ExtractedPage.create(page_attributes)
      end
    end
  end

  # Fix for openoffice/jodconverter: drops the trailing form-feed byte that
  # ends a converted text page.
  #
  # The file is opened in 'a+' mode, so a missing file is created empty, and
  # the block form guarantees the handle is closed even if an I/O call raises.
  #
  # @param file_name [String] path of the page text file
  # @return [nil]
  def remove_last_byte(file_name)
    File.open(file_name, 'a+') do |file|
      size = file.size
      next if size.zero?

      file.seek(size - 1)
      file.truncate(size - 1) if file.getc == "\f"
    end
    nil
  end
end
@@ -0,0 +1,54 @@
module ActAsPageExtractor
  # Runs the given block under a time limit and converts any failure into nil.
  #
  # @param seconds [Numeric] time limit; defaults to five minutes
  # @yield the work to run
  # @return [Object, nil] the block's value, or nil when the block raised a
  #   StandardError (Timeout::Error included)
  def timeout_wrapper(seconds = 60 * 5)
    Timeout.timeout(seconds) { yield }
  rescue StandardError
    # :nocov:
    nil
    # :nocov:
  end

  # True when a positive page count is known and exactly that many pages are
  # stored for this document.
  def is_extracted
    @pdf_pages.to_i.positive? && extracted_pages.count == @pdf_pages
  end

  # Persists the outcome of the extraction: state and page count, plus the
  # document's extension and a human-readable file size.
  #
  # NOTE(review): File.size raises when @document_path is nil or missing even
  # though the doctype line tolerates nil - confirm callers always set it.
  def update_state
    attributes =
      if is_extracted
        {
          page_extraction_state: EXTRACTING_STATES[:extracted],
          page_extraction_pages: @pdf_pages
        }
      else
        {
          page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
          page_extraction_pages: 0
        }
      end

    attributes[:page_extraction_doctype] = @document_path&.split('.')&.last
    attributes[:page_extraction_filesize] = Filesize.from("#{File.size(@document_path)} B").pretty

    update_attributes(attributes)
  end

  # Destroys every page stored for this document.
  def cleanup_pages
    extracted_pages.destroy_all
  end

  # :nocov:
  # Development helper: prints the working document path and the page count.
  def debug_info
    ap @document_path
    ap @pdf_pages
  end
  # :nocov:
end
@@ -0,0 +1,15 @@
module ActAsPageExtractor
  # Points @document_path at the file to extract from.
  #
  # By default that is the working copy. When the copy has a supported
  # archive extension and decompresses to exactly one file, that file is
  # renamed to "<tmp_dir>/<original base name>.<unpacked extension>" and
  # becomes the document instead.
  def unzip_document
    @document_path = @copy_document_path
    return unless validate_compress_types

    unpacked = TotalCompressor.decompress(@copy_document_path)
    return unless unpacked[:success] && unpacked[:files].length == 1

    unpacked_file = unpacked[:files].first
    base_name = @origin_document_path.split("/").last.split('.').first
    extension = unpacked_file.split('/').last.split('.').last

    @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
    File.rename(unpacked_file, @document_path)
  end
end
@@ -0,0 +1,22 @@
module ActAsPageExtractor
  # Archive extensions that are unpacked before extraction.
  VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
  # Document extensions that can be converted and split into pages.
  VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
                        'rtf', 'odt', 'htm', 'html'].freeze

  # True when the working copy is small enough and the document has a
  # supported extension.
  def valid_document
    validate_size && validate_doc_types
  end

  # Checks that the working copy exists and is no larger than the limit.
  #
  # A missing or unset path is reported as invalid (false) instead of raising,
  # so the validator can always be used as a plain predicate.
  #
  # @param max_bytes [Integer] size limit in bytes; defaults to 1 MiB
  # @return [Boolean]
  def validate_size(max_bytes = 2**20)
    return false unless @copy_document_path && File.file?(@copy_document_path)

    File.size(@copy_document_path) <= max_bytes
  end

  # True when the working copy's extension is a supported archive type
  # (case-insensitive).
  def validate_compress_types
    VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
  end

  # True when the document's extension is a supported document type
  # (case-insensitive).
  def validate_doc_types
    VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
  end
end
@@ -0,0 +1,5 @@
# :nocov:
module ActAsPageExtractor
  # Gem release number. Frozen so the shared constant cannot be mutated in
  # place.
  VERSION = '0.1.0'.freeze
end
# :nocov:
@@ -0,0 +1,49 @@
# :nocov:
require 'rails/generators/active_record'
require 'rails/generators/base'

module ActAsPageExtractor
  module Generators # :nodoc:
    # Generates the two migrations, the ExtractedPage model and the
    # initializer needed by act_as_page_extractor.
    #
    # Arguments:
    #   document_class    - name of the model that owns the pages
    #                       (default 'Document')
    #   additional_fields - extra columns copied onto every extracted page
    class MigrationGenerator < Rails::Generators::Base # :nodoc:
      include Rails::Generators::Migration

      argument :document_class, type: :string, default: 'Document'
      argument :additional_fields, type: :array, default: []

      # Directory used to look up the generator's templates.
      def self.default_generator_root
        File.dirname(__FILE__)
      end

      # Delegates migration numbering to ActiveRecord's own generator.
      # Kept above `private`, which has no effect on `def self.` methods.
      def self.next_migration_number(dirname)
        ActiveRecord::Generators::Base.next_migration_number(dirname)
      end

      # Writes both migrations, the model and the initializer.
      def create_migration_file
        migration_template 'create_extracted_pages_table.rb.erb',
                           "db/migrate/create_#{page_extractor_table_name}.rb"
        migration_template 'add_page_extractor_fields_to_documents.rb.erb',
                           "db/migrate/add_page_extractor_fields_to_#{documents_table_name}.rb"
        template "extracted_page.rb.erb", "app/models/extracted_page.rb"
        template "act_as_page_extractor.rb.erb", "config/initializers/act_as_page_extractor.rb"
      end

      private

      def page_extractor_table_name
        'extracted_pages'
      end

      def migration_class_name_page_extractor
        "Create#{page_extractor_table_name.camelize}"
      end

      def documents_table_name
        document_class.underscore.pluralize
      end

      # Derived from the table name, exactly like the migration file name, so
      # the class name matches the file whatever casing the argument used
      # (e.g. `document` as well as `Document`).
      def migration_class_name_documents
        "AddPageExtractorFieldsTo#{documents_table_name.camelize}"
      end
    end
  end
end
# :nocov:
@@ -0,0 +1,14 @@
module ActAsPageExtractor
  # Creates one <%=document_class%> per regular file found directly inside
  # +directory+ and attaches the file through the configured uploader
  # attribute.
  #
  # @param directory [String, nil] folder to import from; nil is a no-op
  # @return [Array<String>, nil] the globbed paths, or nil without a directory
  def self.import_files(directory: nil)
    return unless directory

    Dir["#{directory}/*"].each do |fname|
      # Sub-directories match the glob too but cannot be attached.
      next unless File.file?(fname)

      ap fname
      document = <%=document_class%>.new(
        name: fname.split('/').last
      )
      # Block form closes the handle once the record is saved.
      # NOTE(review): assumes the uploader no longer needs the handle after
      # save! - confirm for the uploader in use.
      File.open(fname) do |file|
        document.send("#{extracted_filename}=".to_sym, file)
        document.save!
      end
    end
  end
end
@@ -0,0 +1,8 @@
# Adds the extraction bookkeeping columns to <%= documents_table_name %>.
# The superclass carries a version suffix on Rails 5+, where inheriting from
# a bare ActiveRecord::Migration is no longer supported.
class <%= migration_class_name_documents %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    add_column :<%= documents_table_name %>, :page_extraction_state, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
    add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
  end
end
@@ -0,0 +1,19 @@
# Creates the <%= page_extractor_table_name %> table with one row per extracted page.
# The superclass carries a version suffix on Rails 5+, where inheriting from
# a bare ActiveRecord::Migration is no longer supported.
class <%= migration_class_name_page_extractor %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    create_table :<%= page_extractor_table_name %> do |t|
      t.text :page
      t.integer :<%= documents_table_name.singularize %>_id, null: false
      <% additional_fields.each do |field|%>
      t.integer :<%= field %><%end%>
      t.integer :page_number
      t.timestamps
    end

    # Lookups by owning document, by each additional field, and by both.
    add_index :<%= page_extractor_table_name %>, :<%= documents_table_name.singularize %>_id
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, :<%= field %><%end%>
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :<%= field %>]<%end%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :page_number]
  end
end
@@ -0,0 +1,3 @@
# One page of text extracted from a <%= documents_table_name.singularize %>.
class ExtractedPage < ActiveRecord::Base
  belongs_to(:<%= documents_table_name.singularize %>)
end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+ require 'act_as_page_extractor'
3
+
describe ActAsPageExtractor do
  # Every supported format, plain and archived, must yield the same 3 pages.
  context 'correct extraction' do
    %w[
      test-doc-3-pages.docx
      test-doc-3-pages.doc
      test-doc-3-pages.pdf
      test-doc-3-pages.rtf
      test-doc-3-pages.odt
      test-doc-3-pages.html
      test-doc-3-pages.txt
      test-doc-3-pages.docx.zip
      test-doc-3-pages.docx.rar
      test-doc-3-pages.docx.7z
    ].each do |document|
      it "extraction valid document #{document}" do
        book = Book.new({ doc_path: document })
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:extracted])
        expect(ExtractedPage.array.count).to eq(3)
        expect(ExtractedPage.array[0][:page]).to match(/require \'act_as_page_extractor\/modules\/interface\'/)

        # Only non-PDF sources get a stored PDF copy that can be removed.
        unless document.match(/pdf/)
          expect(book.pdf_path).to match(/pdf/)
          expect(book.remove_files.count).to eq(1)
        end

        expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
      end
    end
  end

  # An unsupported extension must end in the error state.
  context 'incorrect extraction' do
    ['test-doc-3-pages.wrong'].each do |document|
      it "extraction invalid document #{document}" do
        book = Book.new({ doc_path: document })
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction'])
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ if ENV['COVERAGE']
2
+ require 'simplecov'
3
+ SimpleCov.start 'rails'
4
+ end
5
+ require 'rspec'
6
+ require 'support/models'
7
+ require 'act_as_page_extractor'
8
+ require 'byebug'
@@ -0,0 +1,92 @@
1
+ require 'act_as_page_extractor'
2
+
# Test double for an uploader object: it only carries the file URL.
class Filename
  attr_accessor :url

  # @param params [Hash] must respond to [] with :url
  def initialize(params)
    self.url = params[:url]
  end
end
9
+
# Test double for a document model. It provides just enough of the model API
# (callbacks, count, update_attributes, extracted_pages) for the extractor
# to run without a database.
class Book
  cattr_accessor :id,
                 :category_id,
                 :user_id,
                 :page_extraction_state,
                 :page_extraction_pages,
                 :page_extraction_doctype,
                 :page_extraction_filesize

  # Runs the callback body immediately instead of registering it.
  def self.before_create
    yield
  end

  # Destroy callbacks are ignored by the double.
  def self.before_destroy(*_args)
  end

  def self.count
    1
  end

  include ActAsPageExtractor

  act_as_page_extractor options: {
    document_class: 'Book',
    save_as_pdf: true,
    filename: :filename, # CarrierWave class with 'filename.url' method
    document_id: :document_id,
    additional_fields: [:category_id, :user_id],
    file_storage: "#{Dir.pwd}/test/",
    pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
  }

  # @param params [Hash] :doc_path is the file name served by #filename
  def initialize(params)
    @doc_path = params[:doc_path]
    # NOTE(review): these assignments set instance variables, while the
    # attributes above are declared with cattr_accessor - confirm whether
    # they are meant to reset the accessor values between examples.
    @id = @category_id = @user_id = nil
    @page_extraction_state = @page_extraction_pages = nil
    @page_extraction_doctype = @page_extraction_filesize = nil
    ExtractedPage.cleanup
  end

  def filename
    Filename.new(url: @doc_path)
  end

  # Returns the shared page list, with a no-op destroy_all so that
  # cleanup_pages can be called on it.
  def extracted_pages
    pages = ExtractedPage.array

    def pages.destroy_all
    end

    pages
  end

  # Assigns every attribute through its writer. Values are passed as objects
  # rather than interpolated into evaluated source, so nil, symbols and
  # strings containing quotes are stored correctly.
  def update_attributes(params)
    params.each do |key, value|
      public_send("#{key}=", value)
    end
  end
end
69
+
# Test double for the ExtractedPage model: created pages are collected in an
# in-memory list instead of a database table.
class ExtractedPage
  attr_accessor :id, :page, :document_id, :category_id, :page_number, :user_id

  # Association stub; the double has no owning document.
  def document
  end

  # Runs the block directly; there is nothing to roll back.
  def self.transaction
    yield
  end

  # Records the attributes of a "created" page.
  #
  # @param params [Hash] page attributes
  # @return [Array<Hash>] the list of all recorded pages
  def self.create(params)
    array << params
  end

  # All pages recorded since the last cleanup. Held in a class-level instance
  # variable rather than a class variable, with the lazy initialisation in
  # one place.
  def self.array
    @array ||= []
  end

  # Forgets every recorded page by starting a new list.
  def self.cleanup
    @array = []
  end
end