act_as_page_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
@@ -0,0 +1,35 @@
1
module ActAsPageExtractor
  # Runs both extraction stages for the current document: produce (or reuse)
  # a PDF, then split that PDF into per-page text files.
  def extract_pages
    convert_to_pdf
    convert_to_text
  end

  # Resolves @pdf_path for @document_path and returns it.
  #
  # A document whose extension is already "pdf" (case-insensitive) is used as
  # is. Anything else is handed to Docsplit.extract_pdf with @tmp_dir as the
  # output directory; the converted file is then looked up next to
  # @document_path under the same name with a ".pdf" extension.
  # @pdf_path ends up nil when the conversion returns a falsy value or the
  # expected file does not exist.
  def convert_to_pdf
    extension = File.extname(@document_path)
    @pdf_path =
      if extension.downcase == '.pdf'
        @document_path
      elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
        # Strip only the real extension so that an extension-less path maps
        # to "<path>.pdf" instead of the bare string "pdf".
        stem = @document_path[0, @document_path.length - extension.length]
        converted = "#{stem}.pdf"
        # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
        converted if File.exist?(converted)
      end
  end

  # Reads the page count of @pdf_path into @pdf_pages and asks Docsplit to
  # write the text of every page into @tmp_dir.
  #
  # This step is best effort and never raises StandardError: whenever the
  # page count cannot be read or the text extraction returns a falsy value,
  # @pdf_pages is reset to nil so that a value left over from an earlier run
  # cannot be mistaken for a successful extraction.
  def convert_to_text
    @pdf_pages = PdfUtils.info(@pdf_path).pages
    return unless @pdf_pages

    extracted = timeout_wrapper do
      Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
    end
    @pdf_pages = nil unless extracted
  rescue StandardError
    # :nocov:
    @pdf_pages = nil
    # :nocov:
  end
end
@@ -0,0 +1,30 @@
1
module ActAsPageExtractor
  # Returns the last "/"-separated segment of the URL exposed by the object
  # that #extracted_filename returns.
  def origin_file_name
    send(:extracted_filename).url.to_s.split('/').last
  end

  # Returns the location of the stored PDF copy, or nil when there is none.
  #
  # A path is only produced for records in the :extracted state whose
  # recorded doctype is not "pdf" (case-insensitive). The file name is the
  # part of #origin_file_name before its first dot, with ".pdf" appended.
  def pdf_path
    return unless page_extraction_state == EXTRACTING_STATES[:extracted]
    return if page_extraction_doctype&.downcase == 'pdf'

    "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
  end

  # Deletes the stored PDF copy if it exists on disk.
  # Returns whatever FileUtils.rm_rf returns, or nil when nothing was removed.
  def remove_files
    stored_pdf = pdf_path
    # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
    FileUtils.rm_rf(stored_pdf) if stored_pdf && File.exist?(stored_pdf)
  end

  # Calls page_extract! on every document that is still in the :new state.
  def self.start_extraction
    document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
  end

  # Returns a hash of counters: the total number of documents, how many have
  # a recorded doctype matching one of VALIDATE_DOC_TYPES, how many do not,
  # and the number of documents in each extraction state.
  def self.statistics
    total = document_class.count
    # The patterns come from a frozen constant, not from user input.
    # ILIKE ANY is PostgreSQL syntax.
    patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
    supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{patterns}])").count
    per_state = EXTRACTING_STATES.map do |state, value|
      [state, document_class.where(page_extraction_state: value).count]
    end

    {
      total: total,
      supported_documents: supported,
      unsupported_documents: total - supported,
      states: per_state.to_h
    }
  end
end
@@ -0,0 +1,47 @@
1
module ActAsPageExtractor
  # Copies the generated PDF (@pdf_path) into pdf_storage.
  #
  # Nothing is copied unless save_as_pdf is enabled, the extraction succeeded
  # (is_extracted), the source document is not itself a PDF and @pdf_path is
  # set. The storage directory is created first so that the copy lands inside
  # it rather than failing or producing a file named like the directory.
  def save_pdf
    return unless save_as_pdf && is_extracted
    return if File.extname(@document_path).downcase == '.pdf'
    return unless @pdf_path

    FileUtils.mkdir_p(pdf_storage)
    FileUtils.cp(@pdf_path, pdf_storage)
  end

  # Marks the record as :extracting and creates one ExtractedPage per page,
  # all inside a single ExtractedPage transaction.
  #
  # The text of page N is read from
  # "<@tmp_dir>/<@document_filename up to its first dot>_<N>.txt".
  # Each created page carries the page text, its 1-based number, this
  # record's id under the extracted_document_id key, and the current value of
  # every attribute listed in additional_fields.
  # Nothing is created when @pdf_pages is nil.
  def save_to_db
    update_attributes(page_extraction_state: EXTRACTING_STATES[:extracting])

    ExtractedPage.transaction do
      @pdf_pages&.times do |index|
        page_number = index + 1
        page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
        remove_last_byte(page_filename)
        # Drop markup-sensitive and control characters from the page text.
        content = File.read(page_filename).delete("<>&\u0001\u25A0\a")

        page_attributes = { page: content, page_number: page_number }
        page_attributes[extracted_document_id] = id
        additional_fields.each do |additional_field|
          page_attributes[additional_field] = send(additional_field.to_sym)
        end

        ExtractedPage.create(page_attributes)
      end
    end
  end

  # Fix for openoffice/jodconverter: drops the trailing form-feed byte that
  # ends a converted text page, if present.
  #
  # The file is opened in 'a+' mode, so a missing file is created empty
  # rather than raising. The block form closes the handle even when an error
  # occurs, and the byte-level comparison does not depend on the encoding.
  def remove_last_byte(file_name)
    File.open(file_name, 'a+') do |file|
      size = file.size
      next if size.zero?

      file.seek(size - 1)
      file.truncate(size - 1) if file.getbyte == 0x0C # "\f"
    end
    nil
  end
end
@@ -0,0 +1,54 @@
1
module ActAsPageExtractor
  # Runs the given block under a time limit and returns the block's value.
  #
  # Returns nil instead of raising when the block raises a StandardError or
  # exceeds the limit (Timeout::Error is a StandardError).
  #
  # @param seconds [Numeric] time limit; defaults to five minutes
  def timeout_wrapper(seconds = 60 * 5)
    Timeout.timeout(seconds) { yield }
  rescue StandardError
    # :nocov:
    nil
    # :nocov:
  end

  # True when at least one page was counted in the PDF and the number of
  # stored extracted pages equals that count.
  def is_extracted
    @pdf_pages.to_i > 0 && extracted_pages.count == @pdf_pages
  end

  # Persists the outcome of the extraction on the record: the final state
  # (:extracted or :'error.extraction'), the page count (0 on error), the
  # text after the last dot of @document_path as doctype, and the
  # human-readable size of the document.
  #
  # A nil or missing @document_path is reported with a size of 0 bytes so
  # that the error state can still be saved instead of raising here.
  def update_state
    outcome =
      if is_extracted
        { page_extraction_state: EXTRACTING_STATES[:extracted], page_extraction_pages: @pdf_pages }
      else
        { page_extraction_state: EXTRACTING_STATES[:'error.extraction'], page_extraction_pages: 0 }
      end

    readable = @document_path && File.file?(@document_path)
    byte_count = readable ? File.size(@document_path) : 0

    update_attributes(
      outcome.merge(
        page_extraction_doctype: @document_path&.split('.')&.last,
        page_extraction_filesize: Filesize.from("#{byte_count} B").pretty
      )
    )
  end

  # Destroys every extracted page that belongs to this record.
  def cleanup_pages
    extracted_pages.destroy_all
  end

  # :nocov:
  # Development helper: prints the current document path and page count
  # using +ap+.
  def debug_info
    ap @document_path
    ap @pdf_pages
  end
  # :nocov:
end
@@ -0,0 +1,15 @@
1
module ActAsPageExtractor
  # Points @document_path at the file that should be processed.
  #
  # By default that is the working copy (@copy_document_path). When the copy
  # is a supported archive (validate_compress_types) and
  # TotalCompressor.decompress reports success with exactly one file, that
  # file is renamed to
  # "<@tmp_dir>/<origin file name up to its first dot><unpacked extension>"
  # and @document_path is set to the new location.
  #
  # The unpacked extension is taken with File.extname, so an archive member
  # without an extension (for example a file literally named "pdf") keeps no
  # extension instead of being treated as a file of that type.
  def unzip_document
    @document_path = @copy_document_path
    return unless validate_compress_types

    result = TotalCompressor.decompress(@copy_document_path)
    return unless result[:success] && result[:files].length == 1

    unpacked_file = result[:files].first
    origin_name = File.basename(@origin_document_path).split('.').first
    @document_path = "#{@tmp_dir}/#{origin_name}#{File.extname(unpacked_file)}"
    File.rename(unpacked_file, @document_path)
  end
end
@@ -0,0 +1,22 @@
1
module ActAsPageExtractor
  # Archive extensions that are unpacked before validation.
  # NOTE(review): gzip files normally carry a "gz" extension, which is not
  # listed here - confirm whether "gzip" is intentional.
  VALIDATE_COMPRESS_TYPES = %w[zip rar 7z gzip].freeze
  # Document extensions accepted for page extraction.
  VALIDATE_DOC_TYPES = %w[txt pdf doc docx rtf odt htm html].freeze
  # Largest accepted size of the working copy, in bytes (1 MiB).
  MAX_DOCUMENT_BYTES = 2**20

  # True when the working copy is small enough and the document to process
  # has an accepted extension.
  def valid_document
    validate_size && validate_doc_types
  end

  # True when the working copy is at most MAX_DOCUMENT_BYTES in size.
  def validate_size
    File.size(@copy_document_path) <= MAX_DOCUMENT_BYTES
  end

  # True when the working copy's extension is a supported archive type.
  def validate_compress_types
    VALIDATE_COMPRESS_TYPES.include?(File.extname(@copy_document_path.to_s).delete('.').downcase)
  end

  # True when the extension of @document_path is a supported document type.
  #
  # The extension is read with File.extname, so a file without an extension
  # (for example one literally named "pdf") or a nil path is rejected rather
  # than accepted or raising.
  def validate_doc_types
    VALIDATE_DOC_TYPES.include?(File.extname(@document_path.to_s).delete('.').downcase)
  end
end
@@ -0,0 +1,5 @@
1
+ # :nocov:
2
module ActAsPageExtractor
  # Version string of the gem; frozen so it cannot be mutated at runtime.
  VERSION = '0.1.0'.freeze
end
5
+ # :nocov:
@@ -0,0 +1,49 @@
1
+ # :nocov:
2
+ require 'rails/generators/active_record'
3
+ require 'rails/generators/base'
4
+
5
module ActAsPageExtractor
  module Generators # :nodoc:
    # Generator that writes the two migrations, the ExtractedPage model and
    # the initializer from the templates that ship with it.
    #
    # Arguments: the document class name (default "Document") followed by any
    # number of additional field names.
    class MigrationGenerator < Rails::Generators::Base # :nodoc:
      include Rails::Generators::Migration

      argument :document_class, type: :string, default: 'Document'
      argument :additional_fields, type: :array, default: []

      class << self
        # Directory of this file.
        def default_generator_root
          File.dirname(__FILE__)
        end

        # Delegates migration numbering to ActiveRecord's generator base.
        def next_migration_number(dirname)
          ActiveRecord::Generators::Base.next_migration_number(dirname)
        end
      end

      # Renders every template into its destination inside the application.
      def create_migration_file
        migration_template(
          'create_extracted_pages_table.rb.erb',
          "db/migrate/create_#{page_extractor_table_name}.rb"
        )
        migration_template(
          'add_page_extractor_fields_to_documents.rb.erb',
          "db/migrate/add_page_extractor_fields_to_#{documents_table_name}.rb"
        )
        template "extracted_page.rb.erb", "app/models/extracted_page.rb"
        template "act_as_page_extractor.rb.erb", "config/initializers/act_as_page_extractor.rb"
      end

      private

      # Name of the table that holds the extracted pages.
      def page_extractor_table_name
        'extracted_pages'
      end

      # Class name used by the migration that creates the pages table.
      def migration_class_name_page_extractor
        "Create#{page_extractor_table_name.camelize}"
      end

      # Table name derived from the document class argument.
      def documents_table_name
        document_class.underscore.pluralize
      end

      # Class name used by the migration that extends the documents table.
      def migration_class_name_documents
        "AddPageExtractorFieldsTo#{document_class.pluralize}"
      end
    end
  end
end
49
+ # :nocov:
@@ -0,0 +1,14 @@
1
module ActAsPageExtractor
  # Creates and saves one <%=document_class%> for every regular file found
  # directly inside +directory+. Does nothing when no directory is given.
  #
  # Sub-directories are skipped, and each file handle is closed as soon as
  # its document has been saved.
  def self.import_files(directory: nil)
    return unless directory

    Dir["#{directory}/*"].each do |fname|
      next unless File.file?(fname)

      File.open(fname) do |file|
        document = <%=document_class%>.new(
          name: File.basename(fname)
        )
        document.send("#{extracted_filename}=", file)
        document.save!
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
# Adds the page-extraction bookkeeping columns to :<%= documents_table_name %>.
# The superclass carries a version suffix on Rails 5 and later, where
# inheriting from a bare ActiveRecord::Migration is no longer supported.
class <%= migration_class_name_documents %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    add_column :<%= documents_table_name %>, :page_extraction_state, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
    add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
  end
end
@@ -0,0 +1,19 @@
1
# Creates :<%= page_extractor_table_name %>, which stores the text of one page per row.
# The superclass carries a version suffix on Rails 5 and later, where
# inheriting from a bare ActiveRecord::Migration is no longer supported.
class <%= migration_class_name_page_extractor %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    create_table :<%= page_extractor_table_name %> do |t|
      t.text :page
      t.integer :<%= documents_table_name.singularize %>_id, null: false
      <% additional_fields.each do |field|%>
      t.integer :<%= field %><%end%>
      t.integer :page_number
      t.timestamps
    end

    add_index :<%= page_extractor_table_name %>, :<%= documents_table_name.singularize %>_id
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, :<%= field %><%end%>
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :<%= field %>]<%end%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :page_number]
  end
end
@@ -0,0 +1,3 @@
1
# One page of text extracted from a <%= documents_table_name.singularize %>.
class ExtractedPage < ActiveRecord::Base
  belongs_to :<%= documents_table_name.singularize %>
end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+ require 'act_as_page_extractor'
3
+
4
describe ActAsPageExtractor do
  # Three-page fixtures in every supported format, plain and archived.
  supported_fixtures = %w[
    test-doc-3-pages.docx
    test-doc-3-pages.doc
    test-doc-3-pages.pdf
    test-doc-3-pages.rtf
    test-doc-3-pages.odt
    test-doc-3-pages.html
    test-doc-3-pages.txt
    test-doc-3-pages.docx.zip
    test-doc-3-pages.docx.rar
    test-doc-3-pages.docx.7z
  ]

  # Fixtures whose extension is not an accepted document type.
  unsupported_fixtures = %w[test-doc-3-pages.wrong]

  context 'correct extraction' do
    supported_fixtures.each do |document|
      it "extraction valid document #{document}" do
        book = Book.new({ doc_path: document })
        # Make the extraction query return only this in-memory book.
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:extracted])
        expect(ExtractedPage.array.count).to eq(3)
        expect(ExtractedPage.array[0][:page]).to match(/require \'act_as_page_extractor\/modules\/interface\'/)

        # Only non-PDF sources get a stored PDF copy that can be removed.
        unless document.match(/pdf/)
          expect(book.pdf_path).to match(/pdf/)
          expect(book.remove_files.count).to eq(1)
        end

        expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
      end
    end
  end

  context 'incorrect extraction' do
    unsupported_fixtures.each do |document|
      it "extraction invalid document #{document}" do
        book = Book.new({ doc_path: document })
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction'])
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ if ENV['COVERAGE']
2
+ require 'simplecov'
3
+ SimpleCov.start 'rails'
4
+ end
5
+ require 'rspec'
6
+ require 'support/models'
7
+ require 'act_as_page_extractor'
8
+ require 'byebug'
@@ -0,0 +1,92 @@
1
+ require 'act_as_page_extractor'
2
+
3
# Minimal stand-in for an uploader object: it only exposes a +url+.
class Filename
  attr_reader :url
  attr_writer :url

  # @param params [Hash] the value under the :url key becomes the url
  def initialize(params)
    self.url = params[:url]
  end
end
9
+
10
# In-memory stand-in for a document model, used by the specs in place of a
# database-backed record.
class Book
  # NOTE(review): cattr_accessor presumably defines class-level accessors, in
  # which case the instance variables reset in #initialize are not what these
  # accessors read - confirm this is intended.
  cattr_accessor :id,
                 :category_id,
                 :user_id,
                 :page_extraction_state,
                 :page_extraction_pages,
                 :page_extraction_doctype,
                 :page_extraction_filesize

  # Callback stub: runs the given block immediately.
  def self.before_create(&block)
    yield
  end

  # Callback stub: accepts any arguments and does nothing.
  def self.before_destroy(*args)
  end

  # Pretends the table holds exactly one document.
  def self.count
    1
  end

  include ActAsPageExtractor

  act_as_page_extractor options: {
    document_class: 'Book',
    save_as_pdf: true,
    filename: :filename, # CarrierWave class with 'filename.url' method
    document_id: :document_id,
    additional_fields: [:category_id, :user_id],
    file_storage: "#{Dir.pwd}/test/",
    pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
  }

  # @param params [Hash] the value under :doc_path is the fixture to serve
  #   through #filename. Also empties the shared ExtractedPage store.
  def initialize(params)
    @doc_path = params[:doc_path]
    @id = @category_id = @user_id = nil
    @page_extraction_state = @page_extraction_pages = nil
    @page_extraction_doctype = @page_extraction_filesize = nil
    ExtractedPage.cleanup
  end

  # Uploader stand-in whose url is the fixture path.
  def filename
    Filename.new(url: @doc_path)
  end

  # Returns the shared ExtractedPage array, extended with a no-op
  # destroy_all so it can be used where an association is expected.
  def extracted_pages
    pages = ExtractedPage.array

    def pages.destroy_all
    end

    pages
  end

  # Assigns every key/value pair through the matching writer.
  #
  # Values are passed as objects rather than interpolated into evaluated
  # source code, so strings containing quotes, nil and symbols are assigned
  # as they are.
  def update_attributes(params)
    params.each do |key, value|
      public_send("#{key}=", value)
    end
  end
end
69
+
70
# In-memory stand-in for the ExtractedPage model: created pages are kept as
# attribute hashes in a single class-wide array.
class ExtractedPage
  attr_accessor :id, :page, :document_id, :category_id, :page_number, :user_id

  class << self
    # Runs the block right away; there is no real transaction.
    def transaction
      yield
    end

    # Appends the attribute hash to the store and returns the store.
    def create(params)
      array << params
    end

    # The store of created attribute hashes (empty until something is created).
    def array
      @array ||= []
    end

    # Replaces the store with a new empty array.
    def cleanup
      @array = []
    end
  end

  # Association stub; always nil.
  def document
  end
end