act_as_page_extractor 0.6.3 → 0.7.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/coverage.yml +32 -0
- data/.gitignore +1 -0
- data/Aptfile.sh +55 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile +4 -2
- data/Gemfile.lock +13 -9
- data/README.md +27 -32
- data/act_as_page_extractor.gemspec +4 -3
- data/lib/act_as_page_extractor/modules/extracting.rb +20 -14
- data/lib/act_as_page_extractor/modules/interface.rb +1 -1
- data/lib/act_as_page_extractor/modules/tools.rb +14 -4
- data/lib/act_as_page_extractor/modules/unzipping.rb +14 -0
- data/lib/act_as_page_extractor/modules/validating.rb +15 -9
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/lib/act_as_page_extractor.rb +27 -17
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +1 -0
- data/spec/act_as_page_extractor_spec.rb +58 -21
- data/spec/spec_helper.rb +1 -1
- data/spec/support/models.rb +11 -2
- data/test/Oscar_Wilde_The_Happy_Prince_en.doc +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.7z +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.rar +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.zip +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.html +395 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.odt +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.pdf +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.rtf +257 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.txt +79 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.wrong +0 -0
- metadata +36 -33
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +0 -279
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +0 -339
- data/test/test-doc-3-pages.txt +0 -125
- data/test/test-doc-3-pages.wrong +0 -0
@@ -4,42 +4,79 @@ require 'act_as_page_extractor'
|
|
4
4
|
describe ActAsPageExtractor do
|
5
5
|
context 'correct extraction' do
|
6
6
|
[
|
7
|
-
'
|
8
|
-
'
|
9
|
-
'
|
10
|
-
'
|
11
|
-
'
|
12
|
-
'
|
13
|
-
'
|
14
|
-
'
|
15
|
-
'
|
16
|
-
'
|
7
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx',
|
8
|
+
'Oscar_Wilde_The_Happy_Prince_en.doc',
|
9
|
+
'Oscar_Wilde_The_Happy_Prince_en.pdf',
|
10
|
+
'Oscar_Wilde_The_Happy_Prince_en.rtf',
|
11
|
+
'Oscar_Wilde_The_Happy_Prince_en.odt',
|
12
|
+
'Oscar_Wilde_The_Happy_Prince_en.html',
|
13
|
+
'Oscar_Wilde_The_Happy_Prince_en.txt',
|
14
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.zip',
|
15
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.rar',
|
16
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.7z'
|
17
17
|
].each do |document|
|
18
|
-
it "
|
18
|
+
it "extracts valid document #{document}" do
|
19
19
|
book = Book.new({doc_path: document})
|
20
20
|
allow(Book).to receive_message_chain('where') { [book] }
|
21
21
|
ActAsPageExtractor.start_extraction
|
22
22
|
expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:extracted]
|
23
|
-
expect(ExtractedPage.array.count).to eq
|
24
|
-
expect(ExtractedPage.array[0][:page]).to match /
|
23
|
+
expect(ExtractedPage.array.count).to eq 4
|
24
|
+
expect(ExtractedPage.array[0][:page]).to match /on a tall column, stood the statue of the Happy Prince/
|
25
25
|
unless document.match /pdf/
|
26
26
|
expect(book.pdf_path).to match /pdf/
|
27
27
|
expect(book.remove_files.count).to eq 1
|
28
|
+
expect(book.pages_extraction_errors).to be_empty
|
28
29
|
end
|
29
30
|
expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
describe 'errors processing' do
|
36
|
+
let(:book) { Book.new({doc_path: document}) }
|
37
|
+
|
38
|
+
before do
|
39
|
+
allow(Book).to receive_message_chain('where') { [book] }
|
40
|
+
end
|
41
|
+
|
42
|
+
context 'when invalid doctype' do
|
43
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.wrong' }
|
44
|
+
|
45
|
+
it "logs invalid doctype" do
|
46
|
+
ActAsPageExtractor.start_extraction
|
47
|
+
expect(book.page_extraction_state).to eq 'error_doctype'
|
48
|
+
expect(book.pages_extraction_errors).to match('error_doctype')
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context 'with extraction timeout' do
|
53
|
+
let(:error_msg) { 'execution expired' }
|
54
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
|
55
|
+
|
56
|
+
before do
|
57
|
+
allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
|
58
|
+
end
|
59
|
+
|
60
|
+
it "logs timeout error" do
|
61
|
+
ActAsPageExtractor.start_extraction
|
62
|
+
expect(book.page_extraction_state).to eq 'error_extraction'
|
63
|
+
expect(book.pages_extraction_errors).to match(error_msg)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context 'when Docsplit returns failure' do
|
68
|
+
let(:error_msg) { 'Unknown Docsplit error' }
|
69
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
|
70
|
+
|
71
|
+
before do
|
72
|
+
allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
|
73
|
+
allow(Docsplit).to receive(:extract_text).and_raise(Timeout::Error.new(error_msg))
|
74
|
+
end
|
75
|
+
|
76
|
+
it "logs Docsplit error" do
|
41
77
|
ActAsPageExtractor.start_extraction
|
42
|
-
expect(book.page_extraction_state).to eq
|
78
|
+
expect(book.page_extraction_state).to eq 'error_extraction'
|
79
|
+
expect(book.pages_extraction_errors).to match(error_msg)
|
43
80
|
end
|
44
81
|
end
|
45
82
|
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/support/models.rb
CHANGED
@@ -14,7 +14,8 @@ class Book
|
|
14
14
|
:page_extraction_state,
|
15
15
|
:page_extraction_pages,
|
16
16
|
:page_extraction_doctype,
|
17
|
-
:page_extraction_filesize
|
17
|
+
:page_extraction_filesize,
|
18
|
+
:pages_extraction_errors
|
18
19
|
|
19
20
|
def self.before_create &block
|
20
21
|
yield
|
@@ -35,6 +36,7 @@ class Book
|
|
35
36
|
filename: :filename, # CarrierWave class with 'filename.url' method
|
36
37
|
document_id: :document_id,
|
37
38
|
additional_fields: [:category_id, :user_id],
|
39
|
+
root_folder: Dir.pwd.to_s,
|
38
40
|
file_storage: "#{Dir.pwd}/test/",
|
39
41
|
pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
|
40
42
|
}
|
@@ -44,6 +46,7 @@ class Book
|
|
44
46
|
@id = @category_id = @user_id = nil
|
45
47
|
@page_extraction_state = @page_extraction_pages = nil
|
46
48
|
@page_extraction_doctype = @page_extraction_filesize = nil
|
49
|
+
@pages_extraction_errors = ''
|
47
50
|
ExtractedPage.cleanup
|
48
51
|
end
|
49
52
|
|
@@ -62,7 +65,13 @@ class Book
|
|
62
65
|
|
63
66
|
def update params
|
64
67
|
params.each do |key, value|
|
65
|
-
|
68
|
+
if value.nil?
|
69
|
+
instance_eval("self.#{key} = nil")
|
70
|
+
elsif value.class == String
|
71
|
+
instance_eval("self.#{key} = \"#{value}\"")
|
72
|
+
else
|
73
|
+
instance_eval("self.#{key} = #{value}")
|
74
|
+
end
|
66
75
|
end
|
67
76
|
end
|
68
77
|
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|