act_as_page_extractor 0.6.2 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +7 -4
- data/act_as_page_extractor.gemspec +1 -1
- data/lib/act_as_page_extractor/modules/extracting.rb +8 -1
- data/lib/act_as_page_extractor/modules/interface.rb +1 -1
- data/lib/act_as_page_extractor/modules/saving.rb +1 -1
- data/lib/act_as_page_extractor/modules/tools.rb +1 -1
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/lib/act_as_page_extractor.rb +2 -2
- data/spec/act_as_page_extractor_spec.rb +13 -13
- data/spec/support/models.rb +1 -1
- data/test/Oscar_Wilde_The_Happy_Prince_en.doc +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.7z +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.rar +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.zip +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.html +395 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.odt +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.pdf +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.rtf +257 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.txt +79 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.wrong +0 -0
- metadata +26 -26
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +0 -279
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +0 -339
- data/test/test-doc-3-pages.txt +0 -125
- data/test/test-doc-3-pages.wrong +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f412c60bccb3fca934efecbc7922af07b41297423e6a2c4fbe04b8110a0e22e8
|
4
|
+
data.tar.gz: 4b281d9c93de0955e90b1a9d500213b1fa7103c449d72354caaa3d5d29702ff5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b545143db8d5fd51fb4c5c3d95d76b8122576e26e6587bb0b8c1ec62303e7e7bc5509554132ba9e65d47294fd2fa7c803a01634529c8e4ca8ecf9d0b3f1a392c
|
7
|
+
data.tar.gz: 3ff648cca05fe842e97db5e5153399bcfcd34f98141cd47ad5ca511de588de67c080778b643cba9be372178a6d6fc497552082bece168d3e10be6788382c8426
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
act_as_page_extractor (0.6.2)
|
4
|
+
act_as_page_extractor (0.6.4)
|
5
5
|
activerecord (~> 6)
|
6
6
|
awesome_print (~> 1)
|
7
7
|
docsplit (~> 0)
|
@@ -80,13 +80,13 @@ GEM
|
|
80
80
|
zeitwerk (2.6.17)
|
81
81
|
|
82
82
|
PLATFORMS
|
83
|
-
|
83
|
+
x86_64-linux
|
84
84
|
|
85
85
|
DEPENDENCIES
|
86
86
|
act_as_page_extractor!
|
87
87
|
activerecord (~> 6)
|
88
88
|
awesome_print
|
89
|
-
bundler (~>
|
89
|
+
bundler (~> 2)
|
90
90
|
byebug
|
91
91
|
docsplit
|
92
92
|
filesize
|
@@ -98,5 +98,8 @@ DEPENDENCIES
|
|
98
98
|
simplecov
|
99
99
|
total_compressor
|
100
100
|
|
101
|
+
RUBY VERSION
|
102
|
+
ruby 3.2.0p0
|
103
|
+
|
101
104
|
BUNDLED WITH
|
102
|
-
|
105
|
+
2.4.1
|
@@ -17,7 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ['lib']
|
19
19
|
|
20
|
-
spec.add_development_dependency 'bundler', '~>
|
20
|
+
spec.add_development_dependency 'bundler', '~> 2'
|
21
21
|
spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
|
22
22
|
spec.add_development_dependency 'byebug', '~> 0'
|
23
23
|
spec.add_development_dependency 'rspec', '~> 0'
|
@@ -1,3 +1,10 @@
|
|
1
|
+
# Fix: https://github.com/documentcloud/docsplit/pull/159
|
2
|
+
class File
|
3
|
+
class << self
|
4
|
+
alias_method :exists?, :exist?
|
5
|
+
end
|
6
|
+
end
|
7
|
+
|
1
8
|
module ActAsPageExtractor
|
2
9
|
def extract_pages
|
3
10
|
convert_to_pdf
|
@@ -10,7 +17,7 @@ module ActAsPageExtractor
|
|
10
17
|
else
|
11
18
|
if timeout_wrapper{ Docsplit.extract_pdf(@document_path, output: @tmp_dir)}
|
12
19
|
pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
|
13
|
-
pdf_path if File.exists?(pdf_path)
|
20
|
+
pdf_path if File.exist?(pdf_path)
|
14
21
|
end
|
15
22
|
end
|
16
23
|
end
|
@@ -11,7 +11,7 @@ module ActAsPageExtractor
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def save_to_db
|
14
|
-
self.
|
14
|
+
self.update(page_extraction_state: EXTRACTING_STATES[:extracting])
|
15
15
|
ExtractedPage.transaction do
|
16
16
|
@pdf_pages&.times&.each do |pdf_page|
|
17
17
|
page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{(pdf_page + 1).to_s}.txt"
|
@@ -32,7 +32,7 @@ module ActAsPageExtractor
|
|
32
32
|
page_extraction_doctype: @document_path&.split('.')&.last,
|
33
33
|
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
|
34
34
|
})
|
35
|
-
self.
|
35
|
+
self.update(updated_attributes)
|
36
36
|
end
|
37
37
|
|
38
38
|
def cleanup_pages
|
@@ -85,13 +85,13 @@ module ActAsPageExtractor
|
|
85
85
|
|
86
86
|
def create_pdf_dir
|
87
87
|
if save_as_pdf
|
88
|
-
FileUtils::mkdir_p(pdf_storage) unless File.exists?(pdf_storage)
|
88
|
+
FileUtils::mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
|
89
89
|
end
|
90
90
|
end
|
91
91
|
|
92
92
|
def create_tmp_dir
|
93
93
|
@tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
|
94
|
-
FileUtils::mkdir_p(@tmp_dir) unless File.exists?(@tmp_dir)
|
94
|
+
FileUtils::mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
|
95
95
|
end
|
96
96
|
|
97
97
|
def copy_document
|
@@ -4,24 +4,24 @@ require 'act_as_page_extractor'
|
|
4
4
|
describe ActAsPageExtractor do
|
5
5
|
context 'correct extraction' do
|
6
6
|
[
|
7
|
-
'
|
8
|
-
'
|
9
|
-
'
|
10
|
-
'
|
11
|
-
'
|
12
|
-
'
|
13
|
-
'
|
14
|
-
'
|
15
|
-
'
|
16
|
-
'
|
7
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx',
|
8
|
+
'Oscar_Wilde_The_Happy_Prince_en.doc',
|
9
|
+
'Oscar_Wilde_The_Happy_Prince_en.pdf',
|
10
|
+
'Oscar_Wilde_The_Happy_Prince_en.rtf',
|
11
|
+
'Oscar_Wilde_The_Happy_Prince_en.odt',
|
12
|
+
'Oscar_Wilde_The_Happy_Prince_en.html',
|
13
|
+
'Oscar_Wilde_The_Happy_Prince_en.txt',
|
14
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.zip',
|
15
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.rar',
|
16
|
+
'Oscar_Wilde_The_Happy_Prince_en.docx.7z'
|
17
17
|
].each do |document|
|
18
18
|
it "extraction valid document #{document}" do
|
19
19
|
book = Book.new({doc_path: document})
|
20
20
|
allow(Book).to receive_message_chain('where') { [book] }
|
21
21
|
ActAsPageExtractor.start_extraction
|
22
22
|
expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:extracted]
|
23
|
-
expect(ExtractedPage.array.count).to eq
|
24
|
-
expect(ExtractedPage.array[0][:page]).to match /
|
23
|
+
expect(ExtractedPage.array.count).to eq 4
|
24
|
+
expect(ExtractedPage.array[0][:page]).to match /on a tall column, stood the statue of the Happy Prince/
|
25
25
|
unless document.match /pdf/
|
26
26
|
expect(book.pdf_path).to match /pdf/
|
27
27
|
expect(book.remove_files.count).to eq 1
|
@@ -33,7 +33,7 @@ describe ActAsPageExtractor do
|
|
33
33
|
|
34
34
|
context 'incorrect extraction' do
|
35
35
|
[
|
36
|
-
'
|
36
|
+
'Oscar_Wilde_The_Happy_Prince_en.wrong',
|
37
37
|
].each do |document|
|
38
38
|
it "extraction invalid document #{document}" do
|
39
39
|
book = Book.new({doc_path: document})
|
data/spec/support/models.rb
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|