act_as_page_extractor 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/coverage.yml +32 -0
- data/.gitignore +1 -0
- data/Aptfile.sh +55 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile +4 -2
- data/Gemfile.lock +13 -9
- data/README.md +27 -32
- data/act_as_page_extractor.gemspec +4 -3
- data/lib/act_as_page_extractor/modules/extracting.rb +20 -14
- data/lib/act_as_page_extractor/modules/interface.rb +1 -1
- data/lib/act_as_page_extractor/modules/tools.rb +14 -4
- data/lib/act_as_page_extractor/modules/unzipping.rb +14 -0
- data/lib/act_as_page_extractor/modules/validating.rb +15 -9
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/lib/act_as_page_extractor.rb +27 -17
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +1 -0
- data/spec/act_as_page_extractor_spec.rb +58 -21
- data/spec/spec_helper.rb +1 -1
- data/spec/support/models.rb +11 -2
- data/test/Oscar_Wilde_The_Happy_Prince_en.doc +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.7z +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.rar +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.docx.zip +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.html +395 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.odt +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.pdf +0 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.rtf +257 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.txt +79 -0
- data/test/Oscar_Wilde_The_Happy_Prince_en.wrong +0 -0
- metadata +36 -33
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +0 -279
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +0 -339
- data/test/test-doc-3-pages.txt +0 -125
- data/test/test-doc-3-pages.wrong +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e2f7b9cc30bb8ba03a61ed873b43d4b1764997c65f470a751abe3a2042561e4
|
4
|
+
data.tar.gz: ec3e7125f2119a3666e8e9732ff2ae171df2b1c7e51a02eb9c0794e9f73a5bf5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b4ba9e08f4a11c250cb9449035d5728dbd764870c85709aa76a2f532e3f41d5e895a813ddd8a7bf98b3885ca871c25aa29867db9f2e7b099e4d99504c3a81907
|
7
|
+
data.tar.gz: 36e66d4bbf0300f570cf964902c0163caad97c8c6ca0c8856c7aaba8c72a1716b0a6a8218e5176edebd8dd261786a9142f9ba9cbfe62cf0193c9b677fb17088a
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# .github/workflows/coverage.yml
|
2
|
+
name: Coverage
|
3
|
+
|
4
|
+
|
5
|
+
on:
|
6
|
+
push:
|
7
|
+
branches: [ main, master ]
|
8
|
+
pull_request:
|
9
|
+
branches: [ main, master ]
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
test:
|
13
|
+
runs-on: ubuntu-latest
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v4
|
16
|
+
- name: Set up Ruby
|
17
|
+
uses: ruby/setup-ruby@v1
|
18
|
+
with:
|
19
|
+
ruby-version: 3.2.3
|
20
|
+
- name: Install dependencies
|
21
|
+
run: |
|
22
|
+
sudo apt-get update
|
23
|
+
sudo apt-get install -y libreoffice unoconv poppler-utils zlib1g zlib1g-dev zip rar p7zip-full
|
24
|
+
bundle install --jobs 4 --retry 3
|
25
|
+
- name: Run tests with coverage
|
26
|
+
run: |
|
27
|
+
bundle exec rspec
|
28
|
+
- name: Upload coverage to Codecov
|
29
|
+
uses: codecov/codecov-action@v4
|
30
|
+
with:
|
31
|
+
files: ./coverage/.resultset.json,./coverage/coverage.json,./coverage/index.html
|
32
|
+
fail_ci_if_error: false
|
data/.gitignore
CHANGED
data/Aptfile.sh
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# https://github.com/phlowerteam/total_compressor
|
2
|
+
sudo apt-get install zlib1g
|
3
|
+
sudo apt-get install zlib1g-dev
|
4
|
+
sudo apt-get install zip
|
5
|
+
sudo apt-get install rar
|
6
|
+
sudo apt-get install p7zip-full
|
7
|
+
|
8
|
+
# PDF
|
9
|
+
sudo apt-get install poppler-utils
|
10
|
+
|
11
|
+
# OpenOffice
|
12
|
+
sudo apt-get install jodconverter
|
13
|
+
|
14
|
+
sudo apt-get install nautilus-filename-repairer
|
15
|
+
sudo apt-get install python3-chardet
|
16
|
+
sudo apt-get install xfonts-encodings
|
17
|
+
sudo apt-get install libfontenc1
|
18
|
+
sudo apt-get install console-setup
|
19
|
+
sudo apt-get install fontconfig
|
20
|
+
sudo apt-get install fontconfig-config
|
21
|
+
sudo apt-get install fonts-kacst
|
22
|
+
sudo apt-get install fonts-kacst-one
|
23
|
+
sudo apt-get install fonts-khmeros-core
|
24
|
+
sudo apt-get install fonts-lao
|
25
|
+
sudo apt-get install fonts-liberation
|
26
|
+
sudo apt-get install fonts-nanum
|
27
|
+
sudo apt-get install fonts-opensymbol
|
28
|
+
sudo apt-get install fonts-sil-gentium-basic
|
29
|
+
sudo apt-get install fonts-takao-pgothic
|
30
|
+
sudo apt-get install fonts-thai-tlwg
|
31
|
+
sudo apt-get install fonts-tlwg-garuda
|
32
|
+
sudo apt-get install fonts-tlwg-kinnari
|
33
|
+
sudo apt-get install fonts-tlwg-loma
|
34
|
+
sudo apt-get install fonts-tlwg-mono
|
35
|
+
sudo apt-get install fonts-tlwg-norasi
|
36
|
+
sudo apt-get install fonts-tlwg-purisa
|
37
|
+
sudo apt-get install fonts-tlwg-sawasdee
|
38
|
+
sudo apt-get install fonts-tlwg-typewriter
|
39
|
+
sudo apt-get install fonts-tlwg-typist
|
40
|
+
sudo apt-get install fonts-tlwg-typo
|
41
|
+
sudo apt-get install fonts-tlwg-umpush
|
42
|
+
sudo apt-get install fonts-tlwg-waree
|
43
|
+
sudo apt-get install gnome-font-viewer
|
44
|
+
sudo apt-get install gsfonts gucharmap
|
45
|
+
sudo apt-get install kbd
|
46
|
+
sudo apt-get install libfontconfig1
|
47
|
+
sudo apt-get install libfontenc1
|
48
|
+
sudo apt-get install libfreetype6
|
49
|
+
sudo apt-get install libxft2
|
50
|
+
sudo apt-get install fonts-ubuntu
|
51
|
+
sudo apt-get install fonts-wqy-microhei
|
52
|
+
sudo apt-get install x11-xfs-utils xfonts-base
|
53
|
+
sudo apt-get install xfonts-encodings
|
54
|
+
sudo apt-get install xfonts-scalable
|
55
|
+
sudo apt-get install xfonts-utils
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
## [0.7.0] - 2025-08-21
|
4
|
+
### Added
|
5
|
+
- Breaking changes: added root folder as an option to access the folder between deployments, improved error processing ([883cafc], [b10a367])
|
6
|
+
|
7
|
+
HINT: to upgrade from an older version, you need to apply a DB schema migration and a data migration like these:
|
8
|
+
```rb
|
9
|
+
# db/migrate/20250804182426_upgrade_act_as_page_extractor_to_version.rb
|
10
|
+
class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
|
11
|
+
def change
|
12
|
+
add_column :documents, :pages_extraction_errors, :text, default: ''
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
# db/data/20250804183544_upgrade_act_as_page_extractor_to_version.rb
|
17
|
+
class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
|
18
|
+
def up
|
19
|
+
Document
|
20
|
+
.where(page_extraction_state: 'error.extraction')
|
21
|
+
.update_all(page_extraction_state: 'error_extraction')
|
22
|
+
end
|
23
|
+
|
24
|
+
def down
|
25
|
+
raise ActiveRecord::IrreversibleMigration
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
29
|
+
|
30
|
+
## [0.6.0] - 2024-08-31
|
31
|
+
### Changed
|
32
|
+
- Upgraded to Ruby 3.2 minimal version ([4b463a1], [be52d9c])
|
33
|
+
- Upgraded to ActiveRecord >=6.x.x ([7881613], [48e6d8b], [8bd3707])
|
34
|
+
- Improved docs & Readme ([c405044], [6e895bb])
|
35
|
+
|
36
|
+
## [0.5.0] - 2024-08-30
|
37
|
+
### Changed
|
38
|
+
- Upgraded to ActiveRecord 6.0 ([9eea586])
|
39
|
+
|
40
|
+
## [0.2.3] - 2020-06-05
|
41
|
+
### Changed
|
42
|
+
- Upgraded to ActiveRecord-5.2.0 ([cde1f36])
|
43
|
+
|
44
|
+
## [0.2.2] - 2020-06-04
|
45
|
+
### Changed
|
46
|
+
- Upgraded to ActiveRecord-5.1.0 ([f6ea8d7])
|
47
|
+
|
48
|
+
## [0.2.1] - 2020-05-11
|
49
|
+
### Changed
|
50
|
+
- Upgraded to ActiveRecord-5.0.0 ([5c595ee], [3eb4ad7])
|
51
|
+
|
52
|
+
## [0.1.6] - 2020-05-10
|
53
|
+
### Changed
|
54
|
+
- Updated libraries ([eca4346]), ([7d3bb4f], [203c689], [171cf27])
|
55
|
+
|
56
|
+
## [0.1.2] - 2018-11-29
|
57
|
+
### Changed
|
58
|
+
- Updated rubyzip library ([38c4156])
|
59
|
+
|
60
|
+
## [0.1.1] - 2017-01-10
|
61
|
+
### Changed
|
62
|
+
- Removed code coverage from Rails generators ([a990357])
|
63
|
+
|
64
|
+
## [0.1.0] - 2017-01-09
|
65
|
+
### Added
|
66
|
+
- Initial commit ([47c0950], [5225f33])
|
67
|
+
- Fixed tests ([e68a6b7])
|
data/Gemfile
CHANGED
@@ -3,9 +3,11 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in total_compressor.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
|
6
|
+
ruby '3.2.3'
|
7
7
|
|
8
|
-
gem '
|
8
|
+
gem 'activerecord'
|
9
|
+
|
10
|
+
gem 'amazing_print'
|
9
11
|
|
10
12
|
gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
|
11
13
|
gem 'pdf_utils' # getting text from pdf
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
act_as_page_extractor (0.
|
5
|
-
activerecord (
|
6
|
-
|
4
|
+
act_as_page_extractor (0.7.0)
|
5
|
+
activerecord (>= 6)
|
6
|
+
amazing_print (~> 1)
|
7
7
|
docsplit (~> 0)
|
8
8
|
filesize (~> 0)
|
9
9
|
pdf-reader (~> 1, >= 1.4)
|
@@ -27,8 +27,9 @@ GEM
|
|
27
27
|
tzinfo (~> 2.0)
|
28
28
|
zeitwerk (~> 2.3)
|
29
29
|
afm (0.2.2)
|
30
|
+
amazing_print (1.8.1)
|
30
31
|
awesome_print (1.9.2)
|
31
|
-
byebug (
|
32
|
+
byebug (12.0.0)
|
32
33
|
concurrent-ruby (1.3.4)
|
33
34
|
diff-lcs (1.5.1)
|
34
35
|
docile (1.4.1)
|
@@ -80,13 +81,13 @@ GEM
|
|
80
81
|
zeitwerk (2.6.17)
|
81
82
|
|
82
83
|
PLATFORMS
|
83
|
-
|
84
|
+
x86_64-linux
|
84
85
|
|
85
86
|
DEPENDENCIES
|
86
87
|
act_as_page_extractor!
|
87
|
-
activerecord
|
88
|
-
|
89
|
-
bundler (
|
88
|
+
activerecord
|
89
|
+
amazing_print
|
90
|
+
bundler (>= 2.2.33)
|
90
91
|
byebug
|
91
92
|
docsplit
|
92
93
|
filesize
|
@@ -98,5 +99,8 @@ DEPENDENCIES
|
|
98
99
|
simplecov
|
99
100
|
total_compressor
|
100
101
|
|
102
|
+
RUBY VERSION
|
103
|
+
ruby 3.2.3p157
|
104
|
+
|
101
105
|
BUNDLED WITH
|
102
|
-
|
106
|
+
2.4.1
|
data/README.md
CHANGED
@@ -1,30 +1,29 @@
|
|
1
|
+
[](https://codecov.io/gh/phlowerteam/act_as_page_extractor)
|
2
|
+
|
1
3
|
act_as_page_extractor
|
2
4
|
================
|
3
5
|
|
4
|
-
|
6
|
+
A library that extracts plain text from documents for subsequent processing, such as indexing and search.
|
5
7
|
|
6
8
|
## Installation
|
7
9
|
|
8
|
-
Install
|
9
|
-
|
10
|
+
Install all dependencies before use:
|
10
11
|
```sh
|
11
|
-
|
12
|
+
sh Aptfile.sh
|
12
13
|
```
|
13
|
-
Add this line to your application's Gemfile:
|
14
14
|
|
15
|
+
Add this to your Gemfile:
|
15
16
|
```rb
|
16
17
|
gem 'act_as_page_extractor'
|
17
|
-
bundle
|
18
18
|
```
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
Generate a migration, for example for a Document model:
|
23
22
|
```sh
|
24
23
|
rails g act_as_page_extractor:migration Document category_id user_id
|
25
24
|
```
|
26
25
|
|
27
|
-
|
26
|
+
This will generate two migration files:
|
28
27
|
```rb
|
29
28
|
class AddPageExtractorFields < ActiveRecord::Migration
|
30
29
|
def change
|
@@ -32,6 +31,7 @@ class AddPageExtractorFields < ActiveRecord::Migration
|
|
32
31
|
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
32
|
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
33
|
add_column :documents, :page_extraction_filesize, :string, default: ''
|
34
|
+
add_column :documents, :pages_extraction_errors, :text, default: ''
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
@@ -65,20 +65,20 @@ Add to model next parameters for initializing:
|
|
65
65
|
|
66
66
|
act_as_page_extractor options: {
|
67
67
|
document_class: 'Document',
|
68
|
-
save_as_pdf: true,
|
68
|
+
save_as_pdf: true, # store converted document as PDF
|
69
69
|
filename: :filename,
|
70
70
|
document_id: :document_id,
|
71
|
-
additional_fields: [:category_id, :user_id],
|
72
|
-
#
|
73
|
-
#
|
71
|
+
additional_fields: [:category_id, :user_id], # copy values of these fields from document to extracted_page
|
72
|
+
root_folder: Rails.root.to_s, # or "/full/path/to/project"; needed so the folder can be shared between deployments
|
73
|
+
# file_storage: "/full/path/to/project/public/uploads/documents/storage" # optional
|
74
|
+
# pdf_storage: "/full/path/to/project/public/uploads/extracted/pdf/storage" # optional
|
74
75
|
}
|
75
76
|
|
76
77
|
has_many :extracted_pages, dependent: :destroy
|
77
78
|
end
|
78
79
|
```
|
79
80
|
|
80
|
-
|
81
|
-
|
81
|
+
The instance now provides several new methods:
|
82
82
|
```rb
|
83
83
|
document = Document.first
|
84
84
|
document.page_extract!
|
@@ -100,29 +100,24 @@ ActAsPageExtractor.statistics
|
|
100
100
|
|
101
101
|
Parameters of initializing **act_as_page_extractor**:
|
102
102
|
|
103
|
-
* **document_class**
|
104
|
-
* **save_as_pdf**
|
105
|
-
* **filename**
|
106
|
-
* **document_id**
|
107
|
-
* **additional_fields**
|
108
|
-
* **
|
109
|
-
* **
|
103
|
+
* **document_class** — The name of the model (e.g., `Document`).
|
104
|
+
* **save_as_pdf** — Boolean (`true`/`false`). Indicates whether to save a temporary PDF.
|
105
|
+
* **filename** — The field that provides access to the file. This should be an object with a `url` method that returns the file path (e.g., a CarrierWave object with `filename.url`).
|
106
|
+
* **document_id** — The field name for storing the document ID.
|
107
|
+
* **additional_fields** — Extra fields to be added to the extracted page (useful for indexing, etc.).
|
108
|
+
* **root_folder** — The root folder to be shared across deployments (e.g., `Rails.root.to_s`).
|
109
|
+
* **file_storage** — Path for saving temporary files (default: `"public"`).
|
110
|
+
* **pdf_storage** — Path for saving PDFs (default: `"public/uploads/extracted/pdf"`).
|
110
111
|
|
111
112
|
## Run tests
|
112
113
|
```sh
|
114
|
+
bundle
|
113
115
|
rspec
|
114
116
|
```
|
115
|
-
## Contributing
|
116
|
-
1. Fork it
|
117
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
118
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
119
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
120
|
-
5. Create new Pull Request
|
121
117
|
|
122
118
|
## Contacts
|
123
|
-
https://github.com/phlowerteam
|
124
|
-
phlowerteam@gmail.com
|
119
|
+
https://github.com/phlowerteam / phlowerteam[A]gmail.com
|
125
120
|
|
126
121
|
## License
|
127
|
-
|
128
|
-
MIT License
|
122
|
+
|
123
|
+
MIT License © 2025 PhlowerTeam
|
@@ -16,15 +16,16 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ['lib']
|
19
|
+
spec.required_ruby_version = '>= 3.2'
|
19
20
|
|
20
|
-
spec.add_development_dependency 'bundler',
|
21
|
+
spec.add_development_dependency 'bundler', '>= 2.2.33'
|
21
22
|
spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
|
22
23
|
spec.add_development_dependency 'byebug', '~> 0'
|
23
24
|
spec.add_development_dependency 'rspec', '~> 0'
|
24
25
|
spec.add_development_dependency 'simplecov', '~> 0'
|
25
26
|
|
26
|
-
spec.add_runtime_dependency 'activerecord', '
|
27
|
-
spec.add_runtime_dependency '
|
27
|
+
spec.add_runtime_dependency 'activerecord', '>= 6'
|
28
|
+
spec.add_runtime_dependency 'amazing_print', '~> 1'
|
28
29
|
spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
|
29
30
|
spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
|
30
31
|
spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
|
@@ -1,3 +1,10 @@
|
|
1
|
+
# Fix: https://github.com/documentcloud/docsplit/pull/159
|
2
|
+
class File
|
3
|
+
class << self
|
4
|
+
alias_method :exists?, :exist?
|
5
|
+
end
|
6
|
+
end
|
7
|
+
|
1
8
|
module ActAsPageExtractor
|
2
9
|
def extract_pages
|
3
10
|
convert_to_pdf
|
@@ -10,26 +17,25 @@ module ActAsPageExtractor
|
|
10
17
|
else
|
11
18
|
if timeout_wrapper{ Docsplit.extract_pdf(@document_path, output: @tmp_dir)}
|
12
19
|
pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
|
13
|
-
pdf_path if File.
|
20
|
+
pdf_path if File.exist?(pdf_path)
|
14
21
|
end
|
15
22
|
end
|
23
|
+
rescue StandardError => e
|
24
|
+
add_error(e)
|
16
25
|
end
|
17
26
|
|
18
27
|
def convert_to_text
|
19
|
-
|
20
|
-
|
21
|
-
if @
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# :nocov:
|
28
|
-
end
|
28
|
+
@pdf_pages = PdfUtils.info(@pdf_path).pages
|
29
|
+
if @pdf_pages
|
30
|
+
if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
|
31
|
+
else
|
32
|
+
# :nocov:
|
33
|
+
@pdf_pages = nil
|
34
|
+
raise ERRORS[:unknown_docsplit_error]
|
35
|
+
# :nocov:
|
29
36
|
end
|
30
|
-
# :nocov:
|
31
|
-
rescue
|
32
37
|
end
|
33
|
-
|
38
|
+
rescue StandardError => e
|
39
|
+
add_error(e)
|
34
40
|
end
|
35
41
|
end
|
@@ -5,8 +5,9 @@ module ActAsPageExtractor
|
|
5
5
|
def timeout_wrapper
|
6
6
|
result = nil
|
7
7
|
begin
|
8
|
-
result = Timeout::timeout(
|
9
|
-
rescue
|
8
|
+
result = Timeout::timeout(EXTRACTION_TIMEOUT) { yield }
|
9
|
+
rescue StandardError => e
|
10
|
+
add_error(e)
|
10
11
|
ensure
|
11
12
|
result
|
12
13
|
end
|
@@ -25,12 +26,13 @@ module ActAsPageExtractor
|
|
25
26
|
}
|
26
27
|
else
|
27
28
|
{
|
28
|
-
page_extraction_state: EXTRACTING_STATES[:
|
29
|
+
page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
|
29
30
|
page_extraction_pages: 0
|
30
31
|
}
|
31
32
|
end.merge({
|
32
33
|
page_extraction_doctype: @document_path&.split('.')&.last,
|
33
|
-
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
|
34
|
+
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty,
|
35
|
+
pages_extraction_errors: @pages_extraction_errors.chomp
|
34
36
|
})
|
35
37
|
self.update(updated_attributes)
|
36
38
|
end
|
@@ -39,6 +41,14 @@ module ActAsPageExtractor
|
|
39
41
|
self.extracted_pages.destroy_all
|
40
42
|
end
|
41
43
|
|
44
|
+
def add_error(e)
|
45
|
+
if ERRORS.values.include?(e.message)
|
46
|
+
@pages_extraction_errors << "#{e.message}\n\n"
|
47
|
+
else
|
48
|
+
@pages_extraction_errors << "#{e.class}, #{e.message}\n#{e.backtrace[0..ERROR_BACKTRACE_LINES].join("\n")}\n"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
42
52
|
# :nocov:
|
43
53
|
def debug_info
|
44
54
|
# ap "@tmp_dir"
|
@@ -1,6 +1,9 @@
|
|
1
1
|
module ActAsPageExtractor
|
2
2
|
def unzip_document
|
3
3
|
@document_path = @copy_document_path
|
4
|
+
|
5
|
+
return if VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
6
|
+
|
4
7
|
if validate_compress_types
|
5
8
|
result = TotalCompressor.decompress(@copy_document_path)
|
6
9
|
if result[:success] && result[:files].length == 1
|
@@ -12,4 +15,15 @@ module ActAsPageExtractor
|
|
12
15
|
end
|
13
16
|
end
|
14
17
|
end
|
18
|
+
|
19
|
+
def validate_compress_types
|
20
|
+
valid = VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
|
21
|
+
|
22
|
+
unless valid
|
23
|
+
@page_extraction_state = EXTRACTING_STATES[:error_doctype]
|
24
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
|
25
|
+
end
|
26
|
+
|
27
|
+
valid
|
28
|
+
end
|
15
29
|
end
|
@@ -1,22 +1,28 @@
|
|
1
1
|
module ActAsPageExtractor
|
2
|
-
VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
|
3
|
-
VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
|
4
|
-
'rtf', 'odt', 'htm', 'html'].freeze
|
5
|
-
|
6
2
|
def valid_document
|
7
3
|
validate_size && validate_doc_types
|
8
4
|
end
|
9
5
|
|
10
6
|
def validate_size
|
11
7
|
mb = 2**20
|
12
|
-
File.size(@copy_document_path) <= 1*mb
|
13
|
-
|
8
|
+
valid = File.size(@copy_document_path) <= 1*mb
|
9
|
+
|
10
|
+
unless valid
|
11
|
+
@page_extraction_state = EXTRACTING_STATES[:error_filesize]
|
12
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
|
15
|
+
valid
|
17
16
|
end
|
18
17
|
|
19
18
|
def validate_doc_types
|
20
|
-
VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
19
|
+
valid = VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
20
|
+
|
21
|
+
unless valid
|
22
|
+
@page_extraction_state = EXTRACTING_STATES[:error_doctype]
|
23
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
|
24
|
+
end
|
25
|
+
|
26
|
+
valid
|
21
27
|
end
|
22
28
|
end
|
@@ -19,9 +19,26 @@ require 'act_as_page_extractor/modules/saving.rb'
|
|
19
19
|
require 'act_as_page_extractor/modules/interface'
|
20
20
|
|
21
21
|
module ActAsPageExtractor
|
22
|
-
|
23
22
|
extend ActiveSupport::Concern
|
24
23
|
|
24
|
+
DEFAULT_ROOT_FOLDER = Dir.pwd.to_s
|
25
|
+
ERRORS = {
|
26
|
+
unknown_docsplit_error: 'Unknown Docsplit error'
|
27
|
+
}.freeze
|
28
|
+
ERROR_BACKTRACE_LINES = 15
|
29
|
+
EXTRACTING_STATES = {
|
30
|
+
new: 'new',
|
31
|
+
extracting: 'extracting',
|
32
|
+
extracted: 'extracted',
|
33
|
+
error_doctype: 'error_doctype',
|
34
|
+
error_extraction: 'error_extraction',
|
35
|
+
error_filesize: 'error_filesize'
|
36
|
+
}.freeze
|
37
|
+
EXTRACTION_TIMEOUT = 60*5 # 5 minutes
|
38
|
+
VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
|
39
|
+
VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
|
40
|
+
'rtf', 'odt', 'htm', 'html'].freeze
|
41
|
+
|
25
42
|
included do
|
26
43
|
before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
|
27
44
|
before_destroy :remove_files
|
@@ -35,23 +52,16 @@ module ActAsPageExtractor
|
|
35
52
|
ActAsPageExtractor.define_singleton_method(:document_class) {|*args| Object.const_get(options[:document_class]) }
|
36
53
|
define_method(:extracted_document_id){|*args| options[:document_id] }
|
37
54
|
define_method(:additional_fields){|*args| options[:additional_fields] || [] }
|
38
|
-
define_method(:
|
39
|
-
define_method(:
|
55
|
+
define_method(:root_folder){|*args| options[:root_folder] || DEFAULT_ROOT_FOLDER }
|
56
|
+
define_method(:file_storage){|*args| options[:file_storage] || "#{root_folder}/public".freeze }
|
57
|
+
define_method(:pdf_storage){|*args| options[:pdf_storage] || "#{file_storage}/uploads/extracted/pdf".freeze }
|
58
|
+
define_method(:tmp_extraction_file_storage){|*args| "#{root_folder}/tmp/page_extraction" }
|
40
59
|
end
|
41
60
|
end
|
42
61
|
|
43
|
-
EXTRACTING_STATES = {
|
44
|
-
new: 'new',
|
45
|
-
extracting: 'extracting',
|
46
|
-
extracted: 'extracted',
|
47
|
-
'error.extraction': 'error.extraction'
|
48
|
-
}.freeze
|
49
|
-
|
50
|
-
TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
|
51
|
-
FILE_STORAGE = "#{Dir.pwd}/public".freeze
|
52
|
-
PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
|
53
|
-
|
54
62
|
def initialized
|
63
|
+
@page_extraction_state = nil
|
64
|
+
@pages_extraction_errors = ''
|
55
65
|
# add all need callbacks
|
56
66
|
#on destroy remove pdf
|
57
67
|
|
@@ -85,13 +95,13 @@ module ActAsPageExtractor
|
|
85
95
|
|
86
96
|
def create_pdf_dir
|
87
97
|
if save_as_pdf
|
88
|
-
FileUtils::mkdir_p(pdf_storage) unless File.
|
98
|
+
FileUtils::mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
|
89
99
|
end
|
90
100
|
end
|
91
101
|
|
92
102
|
def create_tmp_dir
|
93
|
-
@tmp_dir = "#{
|
94
|
-
FileUtils::mkdir_p(@tmp_dir) unless File.
|
103
|
+
@tmp_dir = "#{tmp_extraction_file_storage}/#{SecureRandom.hex(6)}"
|
104
|
+
FileUtils::mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
|
95
105
|
end
|
96
106
|
|
97
107
|
def copy_document
|
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
CHANGED
@@ -4,5 +4,6 @@ class <%= migration_class_name_documents %> < ActiveRecord::Migration
|
|
4
4
|
add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
|
5
5
|
add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
|
6
6
|
add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
|
7
|
+
add_column :<%= documents_table_name %>, :pages_extraction_errors, :text, default: ''
|
7
8
|
end
|
8
9
|
end
|