act_as_page_extractor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +57 -0
- data/.rmvrc +1 -0
- data/.rspec +3 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +107 -0
- data/LICENSE +21 -0
- data/README.md +119 -0
- data/Rakefile +6 -0
- data/act_as_page_extractor.gemspec +34 -0
- data/lib/act_as_page_extractor.rb +126 -0
- data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
- data/lib/act_as_page_extractor/modules/interface.rb +30 -0
- data/lib/act_as_page_extractor/modules/saving.rb +47 -0
- data/lib/act_as_page_extractor/modules/tools.rb +54 -0
- data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
- data/lib/act_as_page_extractor/modules/validating.rb +22 -0
- data/lib/act_as_page_extractor/version.rb +5 -0
- data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
- data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
- data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
- data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
- data/spec/act_as_page_extractor_spec.rb +46 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/models.rb +92 -0
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +279 -0
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +339 -0
- data/test/test-doc-3-pages.txt +125 -0
- data/test/test-doc-3-pages.wrong +0 -0
- metadata +279 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
module ActAsPageExtractor
  # Runs both conversion stages in order: source document -> PDF, then
  # PDF -> text pages.
  def extract_pages
    convert_to_pdf
    convert_to_text
  end

  # Sets @pdf_path to the PDF version of @document_path.
  #
  # A document whose last extension is "pdf" (case-insensitive) is used as is.
  # Anything else is handed to Docsplit, writing into @tmp_dir; @pdf_path then
  # becomes the document path with its last extension replaced by "pdf".
  # @pdf_path ends up nil when the conversion returned a falsy value or the
  # expected PDF file is not on disk.
  def convert_to_pdf
    @pdf_path =
      if @document_path.split('.').last.downcase == 'pdf'
        @document_path
      elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
        converted_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
        # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
        converted_path if File.exist?(converted_path)
      end
  end

  # Reads the page count of @pdf_path into @pdf_pages and asks Docsplit to
  # extract the text of every page into @tmp_dir.
  #
  # @pdf_pages is reset to nil whenever the page count cannot be read or the
  # text extraction fails, so a previous value can never be mistaken for the
  # result of this run. No exception escapes this method.
  def convert_to_text
    @pdf_pages = PdfUtils.info(@pdf_path).pages
    return unless @pdf_pages

    extracted = timeout_wrapper do
      Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
    end
    @pdf_pages = nil unless extracted
  # :nocov:
  rescue StandardError
    @pdf_pages = nil
  # :nocov:
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module ActAsPageExtractor
  # Last "/"-separated segment of the URL exposed by whatever the
  # extracted_filename method returns.
  def origin_file_name
    send(:extracted_filename).url.to_s.split('/').last
  end

  # Location of the stored PDF copy inside pdf_storage.
  #
  # Returns nil unless the record is in the :extracted state, and also nil
  # when the recorded doctype is "pdf" (case-insensitive). The file name is
  # everything before the first dot of origin_file_name plus ".pdf".
  def pdf_path
    return unless page_extraction_state == EXTRACTING_STATES[:extracted]
    return if page_extraction_doctype&.downcase == 'pdf'

    "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
  end

  # Deletes the stored PDF copy when it exists on disk.
  #
  # Returns whatever FileUtils.rm_rf returns, or nil when nothing was removed.
  def remove_files
    stored_pdf = pdf_path # evaluated once; pdf_path re-derives the name on every call
    # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
    FileUtils.rm_rf(stored_pdf) if stored_pdf && File.exist?(stored_pdf)
  end

  # Calls page_extract! on every document that is still in the :new state.
  def self.start_extraction
    document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
  end

  # Summary counts over document_class.
  #
  # Returns a Hash with :total, :supported_documents (doctype matches any of
  # VALIDATE_DOC_TYPES as a case-insensitive substring), :unsupported_documents
  # and :states (one count per EXTRACTING_STATES entry).
  #
  # NOTE: "ILIKE ANY (array[...])" is PostgreSQL syntax.
  def self.statistics
    total = document_class.count
    # The patterns are passed as a bound array, which ActiveRecord expands to a
    # quoted, comma-separated list, instead of being concatenated into the SQL.
    patterns = VALIDATE_DOC_TYPES.map { |doc_type| "%#{doc_type}%" }
    supported = document_class.where('page_extraction_doctype ILIKE ANY (array[?])', patterns).count
    per_state = EXTRACTING_STATES.map do |state, value|
      [state, document_class.where(page_extraction_state: value).count]
    end

    {
      total: total,
      supported_documents: supported,
      unsupported_documents: total - supported,
      states: per_state.to_h
    }
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module ActAsPageExtractor
  # Copies the converted PDF (@pdf_path) into pdf_storage.
  #
  # Does nothing unless save_as_pdf is enabled, the extraction succeeded
  # (is_extracted), the source document itself is not a PDF and a converted
  # PDF path is known.
  def save_pdf
    return unless save_as_pdf && is_extracted
    return if @document_path.split('.').last&.downcase == 'pdf'
    return unless @pdf_path

    # Without the directory, FileUtils.cp would create a plain file named
    # like pdf_storage instead of copying into it.
    FileUtils.mkdir_p(pdf_storage)
    FileUtils.cp(@pdf_path, pdf_storage)
  end

  # Marks the record as :extracting and stores one ExtractedPage per PDF page
  # inside a single ExtractedPage transaction.
  #
  # Page text is read from "<@tmp_dir>/<name>_<n>.txt", where <name> is the
  # part of @document_filename before its first dot and <n> starts at 1. Each
  # row also gets the owning record id under the extracted_document_id key and
  # a copy of every attribute listed in additional_fields. Nothing is stored
  # when @pdf_pages is nil.
  def save_to_db
    self.update_attributes(page_extraction_state: EXTRACTING_STATES[:extracting])
    ExtractedPage.transaction do
      @pdf_pages&.times do |page_index|
        page_number = page_index + 1
        page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
        remove_last_byte(page_filename)
        # File.read rather than IO.read: IO.read treats a leading "|" in the
        # name as a command to run.
        content = File.read(page_filename).delete("<>&\u0001\u25A0\a")

        page_attributes = { page: content, page_number: page_number }
        page_attributes[extracted_document_id] = self.id
        additional_fields.each do |additional_field|
          page_attributes[additional_field] = send(additional_field.to_sym)
        end

        ExtractedPage.create(page_attributes)
      end
    end
  end

  # Fix for openoffice/jodconverter: drops the trailing form feed ("\f") that
  # ends a converted text page. Files ending in any other byte, and empty
  # files, are left untouched.
  #
  # The file is opened in 'a+' mode, so a missing file is created empty
  # rather than raising. The block form closes the handle even when
  # seek/truncate raises.
  def remove_last_byte(file_name)
    File.open(file_name, 'a+') do |file|
      size = file.size
      next if size.zero?

      file.seek(size - 1)
      file.truncate(size - 1) if file.getbyte == "\f".ord
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module ActAsPageExtractor
  # Runs the given block under a time limit.
  #
  # seconds - limit in seconds (defaults to five minutes).
  #
  # Returns the block's value, or nil when the block raised a StandardError,
  # which includes running out of time (Timeout::Error).
  def timeout_wrapper(seconds = 60 * 5)
    Timeout.timeout(seconds) { yield }
  # :nocov:
  rescue StandardError
    nil
  # :nocov:
  end

  # True when a positive page count is known and exactly that many
  # extracted pages are attached to the record.
  def is_extracted
    @pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
  end

  # Persists the outcome of the extraction: state and page count (:extracted
  # with @pdf_pages, or :'error.extraction' with 0), plus the last extension
  # and pretty-printed size of @document_path.
  def update_state
    outcome =
      if is_extracted
        { page_extraction_state: EXTRACTING_STATES[:extracted], page_extraction_pages: @pdf_pages }
      else
        { page_extraction_state: EXTRACTING_STATES[:'error.extraction'], page_extraction_pages: 0 }
      end

    # File.size? answers nil for a nil/missing path, so a failed run still
    # gets its error state recorded (with a size of 0) instead of raising here.
    document_bytes = File.size?(@document_path.to_s).to_i

    self.update_attributes(
      outcome.merge(
        page_extraction_doctype: @document_path&.split('.')&.last,
        page_extraction_filesize: Filesize.from("#{document_bytes} B").pretty
      )
    )
  end

  # Destroys every extracted page attached to the record.
  def cleanup_pages
    self.extracted_pages.destroy_all
  end

  # :nocov:
  # Debug helper: prints the current document path and page count with `ap`.
  def debug_info
    ap @document_path
    ap @pdf_pages
  end
  # :nocov:
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module ActAsPageExtractor
  # Points @document_path at the file to process.
  #
  # By default that is @copy_document_path. When validate_compress_types
  # accepts the copy and TotalCompressor unpacks it successfully into exactly
  # one file, that file is renamed to
  # "<@tmp_dir>/<origin name up to its first dot>.<unpacked file's last extension>"
  # and @document_path points there instead.
  def unzip_document
    @document_path = @copy_document_path
    return unless validate_compress_types

    unpacked = TotalCompressor.decompress(@copy_document_path)
    return unless unpacked[:success] && unpacked[:files].length == 1

    archive_member = unpacked[:files].first
    base_name = @origin_document_path.split("/").last.split('.').first
    extension = archive_member.split('/').last.split('.').last

    @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
    File.rename(archive_member, @document_path)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module ActAsPageExtractor
  # Archive extensions that unzip_document will try to unpack.
  VALIDATE_COMPRESS_TYPES = %w[zip rar 7z gzip].freeze
  # Document extensions accepted for extraction.
  VALIDATE_DOC_TYPES = %w[txt pdf doc docx rtf odt htm html].freeze
  # Largest accepted document copy, in bytes (1 MiB).
  MAX_DOCUMENT_SIZE = 2**20

  # True when the copy is small enough and the document has a supported type.
  def valid_document
    validate_size && validate_doc_types
  end

  # True when the file at @copy_document_path is at most MAX_DOCUMENT_SIZE bytes.
  def validate_size
    File.size(@copy_document_path) <= MAX_DOCUMENT_SIZE
  end

  # True when the last extension of @copy_document_path (case-insensitive) is
  # a known archive type. A nil path counts as "not an archive".
  def validate_compress_types
    VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.to_s.split('.').last&.downcase)
  end

  # True when the last extension of @document_path (case-insensitive) is a
  # supported document type. A nil path counts as "not supported".
  def validate_doc_types
    VALIDATE_DOC_TYPES.include?(@document_path.to_s.split('.').last&.downcase)
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# :nocov:
|
2
|
+
require 'rails/generators/active_record'
|
3
|
+
require 'rails/generators/base'
|
4
|
+
|
5
|
+
module ActAsPageExtractor
  module Generators # :nodoc:
    # Generator that writes the two migrations, the ExtractedPage model and
    # the initializer from the templates shipped next to this file.
    #
    # Arguments: document_class (default 'Document') and an optional list of
    # additional_fields.
    class MigrationGenerator < Rails::Generators::Base # :nodoc:
      include Rails::Generators::Migration

      argument :document_class, type: :string, default: 'Document'
      argument :additional_fields, type: :array, default: []

      # Templates are looked up relative to this file's directory.
      def self.default_generator_root
        File.dirname(__FILE__)
      end

      # Delegates migration numbering to ActiveRecord's generator. Kept with
      # the other class methods: `private` does not apply to `def self.`
      # methods, so listing it below `private` only suggested otherwise.
      def self.next_migration_number(dirname)
        ActiveRecord::Generators::Base.next_migration_number(dirname)
      end

      # Renders every template into the host application.
      def create_migration_file
        migration_template 'create_extracted_pages_table.rb.erb', "db/migrate/create_#{page_extractor_table_name}.rb"
        migration_template 'add_page_extractor_fields_to_documents.rb.erb', "db/migrate/add_page_extractor_fields_to_#{documents_table_name}.rb"
        template "extracted_page.rb.erb", "app/models/extracted_page.rb"
        template "act_as_page_extractor.rb.erb", "config/initializers/act_as_page_extractor.rb"
      end

      private

      def page_extractor_table_name
        'extracted_pages'
      end

      def migration_class_name_page_extractor
        "Create#{page_extractor_table_name.camelize}"
      end

      def documents_table_name
        document_class.underscore.pluralize
      end

      # Derived from documents_table_name, the same value used in the
      # migration file name, so the class name is always the camelized file
      # name (e.g. HTMLDoc -> html_docs -> HtmlDocs, where pluralizing the
      # raw argument gave HTMLDocs).
      def migration_class_name_documents
        "AddPageExtractorFieldsTo#{documents_table_name.camelize}"
      end
    end
  end
end
|
49
|
+
# :nocov:
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module ActAsPageExtractor
  # Creates and saves one document per regular file found directly inside
  # +directory+. The record's name is the file's base name, and the file is
  # assigned through the "<extracted_filename>=" writer.
  #
  # Does nothing when +directory+ is nil. Raises whatever save! raises.
  def self.import_files(directory: nil)
    return unless directory

    Dir["#{directory}/*"].each do |fname|
      # The glob also matches sub-directories; only plain files can be attached.
      next unless File.file?(fname)

      ap fname
      document = <%=document_class%>.new(
        name: fname.split('/').last
      )
      # Block form: the handle stays open until save! has run and is then
      # closed, instead of leaking one descriptor per imported file.
      File.open(fname) do |file|
        document.send("#{extracted_filename}=".to_sym, file)
        document.save!
      end
    end
  end
end
|
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# Adds the page-extraction bookkeeping columns to the documents table.
# Rails 5+ requires a versioned migration superclass, so the running
# ActiveRecord version is appended; on older versions the plain class is used.
class <%= migration_class_name_documents %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    add_column :<%= documents_table_name %>, :page_extraction_state, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
    add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Creates the table holding one row per extracted page, with indexes on the
# owning document, on every additional field and on (document, page_number).
# Rails 5+ requires a versioned migration superclass, so the running
# ActiveRecord version is appended; on older versions the plain class is used.
class <%= migration_class_name_page_extractor %> < ActiveRecord::Migration<%= ActiveRecord::VERSION::MAJOR >= 5 ? "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" : '' %>
  def change
    create_table :<%= page_extractor_table_name %> do |t|
      t.text :page
      t.integer :<%= documents_table_name.singularize %>_id, null: false
      <% additional_fields.each do |field|%>
      t.integer :<%= field %><%end%>
      t.integer :page_number
      t.timestamps
    end

    add_index :<%= page_extractor_table_name %>, :<%= documents_table_name.singularize %>_id
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, :<%= field %><%end%>
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :<%= field %>]<%end%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :page_number]
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'act_as_page_extractor'
|
3
|
+
|
4
|
+
describe ActAsPageExtractor do
  # Builds a Book for +document+, makes Book.where answer with just that
  # book, runs the extraction and returns the book for assertions.
  def extracted_book_for(document)
    book = Book.new(doc_path: document)
    allow(Book).to receive_message_chain('where') { [book] }
    ActAsPageExtractor.start_extraction
    book
  end

  context 'correct extraction' do
    %w[
      test-doc-3-pages.docx
      test-doc-3-pages.doc
      test-doc-3-pages.pdf
      test-doc-3-pages.rtf
      test-doc-3-pages.odt
      test-doc-3-pages.html
      test-doc-3-pages.txt
      test-doc-3-pages.docx.zip
      test-doc-3-pages.docx.rar
      test-doc-3-pages.docx.7z
    ].each do |document|
      it "extraction valid document #{document}" do
        book = extracted_book_for(document)

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:extracted])
        expect(ExtractedPage.array.count).to eq(3)
        expect(ExtractedPage.array[0][:page]).to match(/require \'act_as_page_extractor\/modules\/interface\'/)

        # A stored PDF copy is only expected for sources that are not PDFs.
        unless document.match(/pdf/)
          expect(book.pdf_path).to match(/pdf/)
          expect(book.remove_files.count).to eq(1)
        end

        expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
      end
    end
  end

  context 'incorrect extraction' do
    %w[
      test-doc-3-pages.wrong
    ].each do |document|
      it "extraction invalid document #{document}" do
        book = extracted_book_for(document)

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction'])
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'act_as_page_extractor'
|
2
|
+
|
3
|
+
# Spec stand-in for an uploader object: it only carries a +url+.
class Filename
  attr_accessor :url

  # params - Hash whose :url entry becomes the url.
  def initialize(params)
    self.url = params[:url]
  end
end
|
9
|
+
|
10
|
+
# Spec stand-in for an ActiveRecord document model. It implements just the
# callbacks, accessors and persistence calls the extractor uses.
class Book
  # NOTE(review): these are class-level accessors, so the values are shared
  # by every Book; the instance variables set in #initialize are not what
  # these accessors read. Left as is; confirm before changing, since code
  # run by the immediately yielded before_create block may rely on the
  # class-level writers.
  cattr_accessor :id,
                 :category_id,
                 :user_id,
                 :page_extraction_state,
                 :page_extraction_pages,
                 :page_extraction_doctype,
                 :page_extraction_filesize

  # Runs the callback block immediately instead of registering it.
  def self.before_create(&block)
    yield
  end

  # Accepts and ignores any before_destroy registration.
  def self.before_destroy(*args)
  end

  def self.count
    1
  end

  include ActAsPageExtractor

  act_as_page_extractor options: {
    document_class: 'Book',
    save_as_pdf: true,
    filename: :filename, # CarrierWave class with 'filename.url' method
    document_id: :document_id,
    additional_fields: [:category_id, :user_id],
    file_storage: "#{Dir.pwd}/test/",
    pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
  }

  # params - Hash with :doc_path, the document the book points at.
  # Also empties the ExtractedPage store.
  def initialize(params)
    @doc_path = params[:doc_path]
    @id = @category_id = @user_id = nil
    @page_extraction_state = @page_extraction_pages = nil
    @page_extraction_doctype = @page_extraction_filesize = nil
    ExtractedPage.cleanup
  end

  def filename
    Filename.new(url: @doc_path)
  end

  # The shared ExtractedPage store, given a do-nothing destroy_all so it can
  # be used where an association is expected.
  def extracted_pages
    pages = ExtractedPage.array

    def pages.destroy_all
    end

    pages
  end

  # Assigns every key/value pair through the matching writer.
  #
  # Uses public_send instead of evaluating a generated "self.key = value"
  # string: that string was a syntax error for nil values and broke on
  # strings containing a single quote.
  def update_attributes(params)
    params.each do |key, value|
      public_send("#{key}=", value)
    end
  end
end
|
69
|
+
|
70
|
+
# Spec stand-in for the ExtractedPage model: "created" rows are just the
# attribute hashes, collected in one class-level array.
class ExtractedPage
  attr_accessor :id, :page, :document_id, :category_id, :page_number, :user_id

  def document
  end

  class << self
    # Runs the block directly; there is no real transaction.
    def transaction(&block)
      yield
    end

    # Appends +params+ to the store and returns the store.
    def create(params)
      array << params
    end

    # The store of created rows. Kept in a class-level instance variable
    # rather than a @@class variable, which would be shared with subclasses.
    def array
      @array ||= []
    end

    # Replaces the store with a fresh empty array.
    def cleanup
      @array = []
    end
  end
end
|