act_as_page_extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +57 -0
- data/.rmvrc +1 -0
- data/.rspec +3 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +107 -0
- data/LICENSE +21 -0
- data/README.md +119 -0
- data/Rakefile +6 -0
- data/act_as_page_extractor.gemspec +34 -0
- data/lib/act_as_page_extractor.rb +126 -0
- data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
- data/lib/act_as_page_extractor/modules/interface.rb +30 -0
- data/lib/act_as_page_extractor/modules/saving.rb +47 -0
- data/lib/act_as_page_extractor/modules/tools.rb +54 -0
- data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
- data/lib/act_as_page_extractor/modules/validating.rb +22 -0
- data/lib/act_as_page_extractor/version.rb +5 -0
- data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
- data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
- data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
- data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
- data/spec/act_as_page_extractor_spec.rb +46 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/models.rb +92 -0
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +279 -0
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +339 -0
- data/test/test-doc-3-pages.txt +125 -0
- data/test/test-doc-3-pages.wrong +0 -0
- metadata +279 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
# Conversion steps of the extraction pipeline: document -> PDF -> text pages.
#
# Reads @document_path and @tmp_dir; writes @pdf_path and @pdf_pages.
module ActAsPageExtractor
  # Runs both conversion steps in order.
  def extract_pages
    convert_to_pdf
    convert_to_text
  end

  # Stores in @pdf_path the PDF to extract text from and returns it.
  #
  # A document whose extension is already "pdf" (any letter case) is used
  # as is. Anything else is handed to Docsplit; the result is expected next
  # to the document, with the extension swapped for "pdf". @pdf_path ends up
  # nil when the conversion fails or that file does not exist.
  def convert_to_pdf
    extension = @document_path.split('.').last.downcase

    @pdf_path =
      if extension == 'pdf'
        @document_path
      elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
        converted_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
        # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
        converted_path if File.exist?(converted_path)
      end
  end

  # Stores the page count of @pdf_path in @pdf_pages and asks Docsplit to
  # write the text pages into @tmp_dir.
  #
  # @pdf_pages is nil afterwards whenever the count could not be read or the
  # text extraction failed, so a value left over from an earlier run is never
  # mistaken for a successful extraction. Always returns nil; errors are
  # swallowed on purpose (best effort).
  def convert_to_text
    @pdf_pages = PdfUtils.info(@pdf_path).pages
    return unless @pdf_pages

    extracted = timeout_wrapper do
      Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
    end
    # :nocov:
    @pdf_pages = nil unless extracted
    # :nocov:
    nil
  # :nocov:
  rescue StandardError
    @pdf_pages = nil
  end
  # :nocov:
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Public helpers: stored-PDF lookup and removal, batch start, and statistics.
module ActAsPageExtractor
  # Last path segment of the uploaded file's URL.
  def origin_file_name
    send(:extracted_filename).url.to_s.split('/').last
  end

  # Path of the PDF copy kept in pdf_storage.
  #
  # Returns nil unless the record is in the :extracted state and its
  # recorded doctype is something other than "pdf" (any letter case).
  # The file name is the part of origin_file_name before its first dot.
  def pdf_path
    return unless page_extraction_state == EXTRACTING_STATES[:extracted]
    return if page_extraction_doctype&.downcase == 'pdf'

    "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
  end

  # Deletes the stored PDF copy when it exists.
  #
  # Returns whatever FileUtils.rm_rf returns, or nil when there was nothing
  # to delete.
  def remove_files
    stored_pdf = pdf_path
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    FileUtils.rm_rf(stored_pdf) if File.exist?(stored_pdf.to_s)
  end

  # Calls page_extract! on every document that is still in the :new state.
  def self.start_extraction
    document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
  end

  # Document counts: total, supported/unsupported doctypes, and one count
  # per extraction state.
  #
  # The doctype filter is raw SQL using ILIKE ANY (array[...]), built only
  # from the VALIDATE_DOC_TYPES constant.
  def self.statistics
    totals_documents = document_class.count
    doctype_patterns = VALIDATE_DOC_TYPES.map { |dt| "'%#{dt}%'" }.join(',')
    supported_documents = document_class.where(
      "page_extraction_doctype ILIKE ANY (array[#{doctype_patterns}])"
    ).count
    per_state = EXTRACTING_STATES.map do |state, value|
      [state, document_class.where(page_extraction_state: value).count]
    end

    {
      total: totals_documents,
      supported_documents: supported_documents,
      unsupported_documents: totals_documents - supported_documents,
      states: per_state.to_h
    }
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Persistence steps: keep a PDF copy and store the extracted text pages.
module ActAsPageExtractor
  # Copies the converted PDF into pdf_storage.
  #
  # Does nothing unless save_as_pdf is enabled, the extraction succeeded,
  # the source document is not itself a PDF, and @pdf_path is set.
  def save_pdf
    return unless save_as_pdf && is_extracted
    return if @document_path.split('.').last&.downcase == 'pdf'

    FileUtils.cp(@pdf_path, pdf_storage) if @pdf_path
  end

  # Marks the record as :extracting, then creates one ExtractedPage per page
  # inside a single transaction.
  #
  # Page N is read from "<@tmp_dir>/<name>_N.txt", where <name> is the part
  # of @document_filename before its first dot. The characters
  # < > & U+0001 U+25A0 and BEL are removed from the text before it is stored.
  def save_to_db
    update_attributes(page_extraction_state: EXTRACTING_STATES[:extracting])

    ExtractedPage.transaction do
      @pdf_pages&.times do |index|
        page_number = index + 1
        page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
        remove_last_byte(page_filename)
        # File.read instead of IO.read: IO.read would run a command for a
        # path that starts with "|".
        content = File.read(page_filename).delete("<" ">" "&" "\u0001" "\u25A0" "\a")

        page_attributes = { page: content, page_number: page_number }
        page_attributes[extracted_document_id] = id
        additional_fields.each do |additional_field|
          page_attributes[additional_field] = send(additional_field.to_sym)
        end

        ExtractedPage.create(page_attributes)
      end
    end
  end

  # Fix for openoffice/jodconverter: drops the trailing form-feed byte of a
  # converted text page.
  #
  # The file is opened in 'a+' mode, so a missing file is created empty
  # (callers read it right afterwards). The block form of File.open closes
  # the handle even when seek/getc/truncate raises. Returns nil.
  def remove_last_byte(file_name)
    File.open(file_name, 'a+') do |file|
      size = file.size
      next if size.zero?

      file.seek(size - 1)
      file.truncate(size - 1) if file.getc == "\f"
    end
    nil
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Internal helpers: time-boxed execution, result bookkeeping and cleanup.
module ActAsPageExtractor
  # Runs the given block with a five-minute limit.
  #
  # Returns the block's value, or nil when the block raises a StandardError
  # (which includes the timeout error on current Ruby versions).
  def timeout_wrapper
    Timeout.timeout(5 * 60) { yield }
  # :nocov:
  rescue StandardError
    nil
  end
  # :nocov:

  # True when at least one page was counted and exactly that many
  # extracted pages exist for this record.
  def is_extracted
    @pdf_pages.to_i.positive? && extracted_pages.count == @pdf_pages
  end

  # Writes the outcome of the extraction to the record: state and page
  # count, plus the document's extension and a human-readable file size.
  def update_state
    outcome =
      if is_extracted
        { page_extraction_state: EXTRACTING_STATES[:extracted], page_extraction_pages: @pdf_pages }
      else
        { page_extraction_state: EXTRACTING_STATES[:'error.extraction'], page_extraction_pages: 0 }
      end

    file_details = {
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
    }

    update_attributes(outcome.merge(file_details))
  end

  # Destroys every extracted page that belongs to this record.
  def cleanup_pages
    extracted_pages.destroy_all
  end

  # :nocov:
  # Prints the current document path and page count for debugging.
  def debug_info
    ap @document_path
    ap @pdf_pages
  end
  # :nocov:
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Archive handling for uploaded documents.
module ActAsPageExtractor
  # Points @document_path at the file to extract.
  #
  # By default that is the working copy (@copy_document_path). When the copy
  # is a supported archive that unpacks to exactly one file, that file is
  # renamed to "<@tmp_dir>/<origin name>.<unpacked extension>" and used
  # instead; <origin name> is the original file name up to its first dot.
  def unzip_document
    @document_path = @copy_document_path
    return unless validate_compress_types

    outcome = TotalCompressor.decompress(@copy_document_path)
    return unless outcome[:success] && outcome[:files].length == 1

    base_name = @origin_document_path.split("/").last.split('.').first
    unpacked_path = outcome[:files].first
    extension = unpacked_path.split('/').last.split('.').last

    @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
    File.rename(unpacked_path, @document_path)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Checks applied to a document before extraction.
module ActAsPageExtractor
  # Extensions treated as archives.
  VALIDATE_COMPRESS_TYPES = %w[zip rar 7z gzip].freeze
  # Extensions of documents that can be extracted.
  VALIDATE_DOC_TYPES = %w[txt pdf doc docx rtf odt htm html].freeze

  # True when the working copy is small enough and the document has a
  # supported extension.
  def valid_document
    validate_size && validate_doc_types
  end

  # True when the working copy (@copy_document_path) is at most 1 MiB.
  def validate_size
    one_megabyte = 1024 * 1024
    File.size(@copy_document_path) <= one_megabyte
  end

  # True when the working copy's extension (case-insensitive) is an
  # archive type.
  def validate_compress_types
    extension = @copy_document_path.split('.').last&.downcase
    VALIDATE_COMPRESS_TYPES.include?(extension)
  end

  # True when the document's extension (case-insensitive) is a supported
  # document type.
  def validate_doc_types
    extension = @document_path.split('.').last&.downcase
    VALIDATE_DOC_TYPES.include?(extension)
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# :nocov:
|
2
|
+
require 'rails/generators/active_record'
|
3
|
+
require 'rails/generators/base'
|
4
|
+
|
5
|
+
module ActAsPageExtractor
  module Generators # :nodoc:
    # Generator that renders two migrations (extracted pages table, extra
    # columns on the documents table), the ExtractedPage model and the
    # initializer.
    #
    # Arguments: the document class name (default 'Document') followed by
    # any number of additional field names.
    class MigrationGenerator < Rails::Generators::Base # :nodoc:
      include Rails::Generators::Migration

      argument :document_class, type: :string, default: 'Document'
      argument :additional_fields, type: :array, default: []

      # Directory of this file.
      def self.default_generator_root
        File.dirname(__FILE__)
      end

      # Delegates migration numbering to ActiveRecord's generator base.
      def self.next_migration_number(dirname)
        ActiveRecord::Generators::Base.next_migration_number(dirname)
      end

      # The only generator step: renders all four templates.
      def create_migration_file
        migration_template 'create_extracted_pages_table.rb.erb',
                           "db/migrate/create_#{page_extractor_table_name}.rb"
        migration_template 'add_page_extractor_fields_to_documents.rb.erb',
                           "db/migrate/add_page_extractor_fields_to_#{documents_table_name}.rb"
        template 'extracted_page.rb.erb', 'app/models/extracted_page.rb'
        template 'act_as_page_extractor.rb.erb', 'config/initializers/act_as_page_extractor.rb'
      end

      private

      # Table that stores the extracted pages.
      def page_extractor_table_name
        'extracted_pages'
      end

      # Table name derived from the document class argument.
      def documents_table_name
        document_class.underscore.pluralize
      end

      # Class name for the migration that creates the pages table.
      def migration_class_name_page_extractor
        "Create#{page_extractor_table_name.camelize}"
      end

      # Class name for the migration that extends the documents table.
      def migration_class_name_documents
        "AddPageExtractorFieldsTo#{document_class.pluralize}"
      end
    end
  end
end
|
49
|
+
# :nocov:
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module ActAsPageExtractor
  # Creates and saves one document record for every entry directly inside
  # +directory+, attaching the entry as the record's file.
  #
  # Does nothing when +directory+ is nil. Raises whatever save! raises.
  def self.import_files(directory: nil)
    return unless directory

    Dir["#{directory}/*"].each do |fname|
      ap fname
      document = <%=document_class%>.new(
        name: fname.split('/').last
      )
      # Block form so the file handle is closed once the record is saved;
      # the previous bare File.open left one open handle per imported file.
      File.open(fname) do |file|
        document.send("#{extracted_filename}=", file)
        document.save!
      end
    end
  end
end
|
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
<%# Rails 5+ needs a versioned superclass such as ActiveRecord::Migration[5.0]; older Rails has no current_version, so nothing is appended there. %>
class <%= migration_class_name_documents %> < ActiveRecord::Migration<%= "[#{ActiveRecord::Migration.current_version}]" if ActiveRecord::Migration.respond_to?(:current_version) %>
  # Adds the page-extraction bookkeeping columns: state, page count,
  # document type and file size.
  def change
    add_column :<%= documents_table_name %>, :page_extraction_state, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
    add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
    add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<%# Rails 5+ needs a versioned superclass such as ActiveRecord::Migration[5.0]; older Rails has no current_version, so nothing is appended there. %>
class <%= migration_class_name_page_extractor %> < ActiveRecord::Migration<%= "[#{ActiveRecord::Migration.current_version}]" if ActiveRecord::Migration.respond_to?(:current_version) %>
  # Creates the table holding one row per extracted page, with indexes on
  # the document id, on each additional field, and on their combinations.
  def change
    create_table :<%= page_extractor_table_name %> do |t|
      t.text :page
      t.integer :<%= documents_table_name.singularize %>_id, null: false
      <% additional_fields.each do |field|%>
      t.integer :<%= field %><%end%>
      t.integer :page_number
      t.timestamps
    end

    add_index :<%= page_extractor_table_name %>, :<%= documents_table_name.singularize %>_id
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, :<%= field %><%end%>
    <% additional_fields.each do |field|%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :<%= field %>]<%end%>
    add_index :<%= page_extractor_table_name %>, [:<%= documents_table_name.singularize %>_id, :page_number]
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'act_as_page_extractor'
|
3
|
+
|
4
|
+
# End-to-end examples: each fixture is wrapped in a Book, Book.where is
# stubbed to return that book, and the extraction is started.
describe ActAsPageExtractor do
  context 'correct extraction' do
    valid_documents = %w[
      test-doc-3-pages.docx
      test-doc-3-pages.doc
      test-doc-3-pages.pdf
      test-doc-3-pages.rtf
      test-doc-3-pages.odt
      test-doc-3-pages.html
      test-doc-3-pages.txt
      test-doc-3-pages.docx.zip
      test-doc-3-pages.docx.rar
      test-doc-3-pages.docx.7z
    ]

    valid_documents.each do |document|
      it "extraction valid document #{document}" do
        book = Book.new(doc_path: document)
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:extracted])
        expect(ExtractedPage.array.count).to eq(3)
        expect(ExtractedPage.array[0][:page]).to match(/require \'act_as_page_extractor\/modules\/interface\'/)

        # A PDF copy is only expected for fixtures whose name has no "pdf" in it.
        unless document.match(/pdf/)
          expect(book.pdf_path).to match(/pdf/)
          expect(book.remove_files.count).to eq(1)
        end

        expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
      end
    end
  end

  context 'incorrect extraction' do
    invalid_documents = ['test-doc-3-pages.wrong']

    invalid_documents.each do |document|
      it "extraction invalid document #{document}" do
        book = Book.new(doc_path: document)
        allow(Book).to receive_message_chain('where') { [book] }

        ActAsPageExtractor.start_extraction

        expect(book.page_extraction_state).to eq(ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction'])
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'act_as_page_extractor'
|
2
|
+
|
3
|
+
# Spec stand-in for an uploader object: it only exposes +url+.
class Filename
  attr_accessor :url

  # params - Hash-like object; only its :url entry is read.
  def initialize(params)
    self.url = params[:url]
  end
end
|
9
|
+
|
10
|
+
# Spec double for a document model: it stubs the ActiveRecord class-level
# API the gem calls (before_create, before_destroy, count) and mixes in
# ActAsPageExtractor.
class Book
  # NOTE(review): cattr_accessor presumably keeps these values at class
  # level, shared by every Book, so the instance variables assigned in
  # #initialize would not be what the accessors read. Confirm the intent
  # before changing either side.
  cattr_accessor :id,
                 :category_id,
                 :user_id,
                 :page_extraction_state,
                 :page_extraction_pages,
                 :page_extraction_doctype,
                 :page_extraction_filesize

  # Runs the callback block immediately, at registration time.
  def self.before_create &block
    yield
  end

  # Accepts and ignores callback registrations.
  def self.before_destroy *args
  end

  # Pretends exactly one Book exists.
  def self.count
    1
  end

  include ActAsPageExtractor

  act_as_page_extractor options: {
    document_class: 'Book',
    save_as_pdf: true,
    filename: :filename, # CarrierWave class with 'filename.url' method
    document_id: :document_id,
    additional_fields: [:category_id, :user_id],
    file_storage: "#{Dir.pwd}/test/",
    pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
  }

  # params - Hash; :doc_path is the fixture file name served by #filename.
  # Also empties the ExtractedPage store so each example starts clean.
  def initialize(params)
    @doc_path = params[:doc_path]
    @id = @category_id = @user_id = nil
    @page_extraction_state = @page_extraction_pages = nil
    @page_extraction_doctype = @page_extraction_filesize = nil
    ExtractedPage.cleanup
  end

  # Uploader stand-in whose url is the fixture file name.
  def filename
    Filename.new(url: @doc_path)
  end

  # The shared ExtractedPage store, given a no-op destroy_all.
  def extracted_pages
    array ||= ExtractedPage.array

    def array.destroy_all
    end

    array
  end

  # Assigns every key/value pair through the matching writer.
  #
  # Values are passed as objects. The earlier version pasted them into Ruby
  # source and evaluated it, which failed for nil, symbols and strings that
  # contain a quote.
  def update_attributes params
    params.each do |key, value|
      public_send("#{key}=", value)
    end
  end
end
|
69
|
+
|
70
|
+
# Spec double for the ExtractedPage model: "created" rows are attribute
# hashes collected in one class-wide array.
class ExtractedPage
  attr_accessor :id, :page, :document_id, :category_id, :page_number, :user_id

  # Placeholder association; always nil.
  def document
  end

  class << self
    # Runs the block directly; there is no real transaction.
    def transaction(&block)
      yield
    end

    # Appends the attribute hash to the store and returns the store.
    def create(params)
      array << params
    end

    # The store of created rows, initialised on first use.
    def array
      @@array ||= []
    end

    # Empties the store by replacing it with a new array.
    def cleanup
      @@array = []
    end
  end
end
|