act_as_page_extractor 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/coverage.yml +32 -0
  3. data/.gitignore +1 -0
  4. data/Aptfile.sh +55 -0
  5. data/CHANGELOG.md +67 -0
  6. data/Gemfile +4 -2
  7. data/Gemfile.lock +13 -9
  8. data/README.md +27 -32
  9. data/act_as_page_extractor.gemspec +4 -3
  10. data/lib/act_as_page_extractor/modules/extracting.rb +20 -14
  11. data/lib/act_as_page_extractor/modules/interface.rb +1 -1
  12. data/lib/act_as_page_extractor/modules/tools.rb +14 -4
  13. data/lib/act_as_page_extractor/modules/unzipping.rb +14 -0
  14. data/lib/act_as_page_extractor/modules/validating.rb +15 -9
  15. data/lib/act_as_page_extractor/version.rb +1 -1
  16. data/lib/act_as_page_extractor.rb +27 -17
  17. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +1 -0
  18. data/spec/act_as_page_extractor_spec.rb +58 -21
  19. data/spec/spec_helper.rb +1 -1
  20. data/spec/support/models.rb +11 -2
  21. data/test/Oscar_Wilde_The_Happy_Prince_en.doc +0 -0
  22. data/test/Oscar_Wilde_The_Happy_Prince_en.docx +0 -0
  23. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.7z +0 -0
  24. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.rar +0 -0
  25. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.zip +0 -0
  26. data/test/Oscar_Wilde_The_Happy_Prince_en.html +395 -0
  27. data/test/Oscar_Wilde_The_Happy_Prince_en.odt +0 -0
  28. data/test/Oscar_Wilde_The_Happy_Prince_en.pdf +0 -0
  29. data/test/Oscar_Wilde_The_Happy_Prince_en.rtf +257 -0
  30. data/test/Oscar_Wilde_The_Happy_Prince_en.txt +79 -0
  31. data/test/Oscar_Wilde_The_Happy_Prince_en.wrong +0 -0
  32. metadata +36 -33
  33. data/test/test-doc-3-pages.doc +0 -0
  34. data/test/test-doc-3-pages.docx +0 -0
  35. data/test/test-doc-3-pages.docx.7z +0 -0
  36. data/test/test-doc-3-pages.docx.rar +0 -0
  37. data/test/test-doc-3-pages.docx.zip +0 -0
  38. data/test/test-doc-3-pages.html +0 -279
  39. data/test/test-doc-3-pages.odt +0 -0
  40. data/test/test-doc-3-pages.pdf +0 -0
  41. data/test/test-doc-3-pages.rtf +0 -339
  42. data/test/test-doc-3-pages.txt +0 -125
  43. data/test/test-doc-3-pages.wrong +0 -0
data/spec/act_as_page_extractor_spec.rb CHANGED
@@ -4,42 +4,79 @@ require 'act_as_page_extractor'
4
4
  describe ActAsPageExtractor do
5
5
  context 'correct extraction' do
6
6
  [
7
- 'test-doc-3-pages.docx',
8
- 'test-doc-3-pages.doc',
9
- 'test-doc-3-pages.pdf',
10
- 'test-doc-3-pages.rtf',
11
- 'test-doc-3-pages.odt',
12
- 'test-doc-3-pages.html',
13
- 'test-doc-3-pages.txt',
14
- 'test-doc-3-pages.docx.zip',
15
- 'test-doc-3-pages.docx.rar',
16
- 'test-doc-3-pages.docx.7z'
7
+ 'Oscar_Wilde_The_Happy_Prince_en.docx',
8
+ 'Oscar_Wilde_The_Happy_Prince_en.doc',
9
+ 'Oscar_Wilde_The_Happy_Prince_en.pdf',
10
+ 'Oscar_Wilde_The_Happy_Prince_en.rtf',
11
+ 'Oscar_Wilde_The_Happy_Prince_en.odt',
12
+ 'Oscar_Wilde_The_Happy_Prince_en.html',
13
+ 'Oscar_Wilde_The_Happy_Prince_en.txt',
14
+ 'Oscar_Wilde_The_Happy_Prince_en.docx.zip',
15
+ 'Oscar_Wilde_The_Happy_Prince_en.docx.rar',
16
+ 'Oscar_Wilde_The_Happy_Prince_en.docx.7z'
17
17
  ].each do |document|
18
- it "extraction valid document #{document}" do
18
+ it "extracts valid document #{document}" do
19
19
  book = Book.new({doc_path: document})
20
20
  allow(Book).to receive_message_chain('where') { [book] }
21
21
  ActAsPageExtractor.start_extraction
22
22
  expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:extracted]
23
- expect(ExtractedPage.array.count).to eq 3
24
- expect(ExtractedPage.array[0][:page]).to match /require \'act_as_page_extractor\/modules\/interface\'/
23
+ expect(ExtractedPage.array.count).to eq 4
24
+ expect(ExtractedPage.array[0][:page]).to match /on a tall column, stood the statue of the Happy Prince/
25
25
  unless document.match /pdf/
26
26
  expect(book.pdf_path).to match /pdf/
27
27
  expect(book.remove_files.count).to eq 1
28
+ expect(book.pages_extraction_errors).to be_empty
28
29
  end
29
30
  expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
30
31
  end
31
32
  end
32
33
  end
33
34
 
34
- context 'incorrect extraction' do
35
- [
36
- 'test-doc-3-pages.wrong',
37
- ].each do |document|
38
- it "extraction invalid document #{document}" do
39
- book = Book.new({doc_path: document})
40
- allow(Book).to receive_message_chain('where') { [book] }
35
+ describe 'errors processing' do
36
+ let(:book) { Book.new({doc_path: document}) }
37
+
38
+ before do
39
+ allow(Book).to receive_message_chain('where') { [book] }
40
+ end
41
+
42
+ context 'when invalid doctype' do
43
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.wrong' }
44
+
45
+ it "logs invalid doctype" do
46
+ ActAsPageExtractor.start_extraction
47
+ expect(book.page_extraction_state).to eq 'error_doctype'
48
+ expect(book.pages_extraction_errors).to match('error_doctype')
49
+ end
50
+ end
51
+
52
+ context 'with extraction timeout' do
53
+ let(:error_msg) { 'execution expired' }
54
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
55
+
56
+ before do
57
+ allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
58
+ end
59
+
60
+ it "logs timeout error" do
61
+ ActAsPageExtractor.start_extraction
62
+ expect(book.page_extraction_state).to eq 'error_extraction'
63
+ expect(book.pages_extraction_errors).to match(error_msg)
64
+ end
65
+ end
66
+
67
+ context 'when Docsplit returns failure' do
68
+ let(:error_msg) { 'Unknown Docsplit error' }
69
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
70
+
71
+ before do
72
+ allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
73
+ allow(Docsplit).to receive(:extract_text).and_raise(Timeout::Error.new(error_msg))
74
+ end
75
+
76
+ it "logs Docsplit error" do
41
77
  ActAsPageExtractor.start_extraction
42
- expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction']
78
+ expect(book.page_extraction_state).to eq 'error_extraction'
79
+ expect(book.pages_extraction_errors).to match(error_msg)
43
80
  end
44
81
  end
45
82
  end
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,7 @@ unless ENV['SKIP_COVERAGE']
3
3
  SimpleCov.start 'rails' do
4
4
  add_filter 'vendor'
5
5
  end
6
- SimpleCov.minimum_coverage 100
6
+ SimpleCov.minimum_coverage 98
7
7
  end
8
8
 
9
9
  require 'rspec'
data/spec/support/models.rb CHANGED
@@ -14,7 +14,8 @@ class Book
14
14
  :page_extraction_state,
15
15
  :page_extraction_pages,
16
16
  :page_extraction_doctype,
17
- :page_extraction_filesize
17
+ :page_extraction_filesize,
18
+ :pages_extraction_errors
18
19
 
19
20
  def self.before_create &block
20
21
  yield
@@ -35,6 +36,7 @@ class Book
35
36
  filename: :filename, # CarrierWave class with 'filename.url' method
36
37
  document_id: :document_id,
37
38
  additional_fields: [:category_id, :user_id],
39
+ root_folder: Dir.pwd.to_s,
38
40
  file_storage: "#{Dir.pwd}/test/",
39
41
  pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
40
42
  }
@@ -44,6 +46,7 @@ class Book
44
46
  @id = @category_id = @user_id = nil
45
47
  @page_extraction_state = @page_extraction_pages = nil
46
48
  @page_extraction_doctype = @page_extraction_filesize = nil
49
+ @pages_extraction_errors = ''
47
50
  ExtractedPage.cleanup
48
51
  end
49
52
 
@@ -62,7 +65,13 @@ class Book
62
65
 
63
66
  def update params
64
67
  params.each do |key, value|
65
- instance_eval("self.#{key} = #{value.class == String ? '\'' + value + '\'': value }")
68
+ if value.nil?
69
+ instance_eval("self.#{key} = nil")
70
+ elsif value.class == String
71
+ instance_eval("self.#{key} = \"#{value}\"")
72
+ else
73
+ instance_eval("self.#{key} = #{value}")
74
+ end
66
75
  end
67
76
  end
68
77
  end