act_as_page_extractor 0.6.4 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/coverage.yml +32 -0
- data/.gitignore +1 -0
- data/Aptfile.sh +55 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile +3 -3
- data/Gemfile.lock +9 -8
- data/README.md +27 -32
- data/act_as_page_extractor.gemspec +4 -3
- data/lib/act_as_page_extractor/modules/extracting.rb +12 -13
- data/lib/act_as_page_extractor/modules/tools.rb +14 -4
- data/lib/act_as_page_extractor/modules/unzipping.rb +14 -0
- data/lib/act_as_page_extractor/modules/validating.rb +15 -9
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/lib/act_as_page_extractor.rb +25 -15
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +1 -0
- data/spec/act_as_page_extractor_spec.rb +46 -9
- data/spec/spec_helper.rb +1 -1
- data/spec/support/models.rb +11 -2
- metadata +18 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a11b311c9575aace2a74e468f35b062c6e444d58e6f62caed99b70bab60703b
|
4
|
+
data.tar.gz: 9c463a553f4e3490110f46626d2c4c6c0f124b2522c63ef0e7db2acc856b12a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4db62f37880a270dfe39dcedf5a537b2e5699633709348a030ae2274218478493c994099d38e7ff21729bdf543e681c66b818d2bc321b6b49f1478fa348d00c7
|
7
|
+
data.tar.gz: e2df6c4f723418d1098b0d0afac87fef847b50a87a766198e13ee42b8ff2cf4fac5e2788f09a7105ee13dbec4d84bfaf5084f9bcae7ad127915ab08d652f9deb
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# .github/workflows/coverage.yml
|
2
|
+
name: Coverage
|
3
|
+
|
4
|
+
|
5
|
+
on:
|
6
|
+
push:
|
7
|
+
branches: [ main, master ]
|
8
|
+
pull_request:
|
9
|
+
branches: [ main, master ]
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
test:
|
13
|
+
runs-on: ubuntu-latest
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v4
|
16
|
+
- name: Set up Ruby
|
17
|
+
uses: ruby/setup-ruby@v1
|
18
|
+
with:
|
19
|
+
ruby-version: 3.2.3
|
20
|
+
- name: Install dependencies
|
21
|
+
run: |
|
22
|
+
sudo apt-get update
|
23
|
+
sudo apt-get install -y libreoffice unoconv poppler-utils zlib1g zlib1g-dev zip rar p7zip-full
|
24
|
+
bundle install --jobs 4 --retry 3
|
25
|
+
- name: Run tests with coverage
|
26
|
+
run: |
|
27
|
+
bundle exec rspec
|
28
|
+
- name: Upload coverage to Codecov
|
29
|
+
uses: codecov/codecov-action@v4
|
30
|
+
with:
|
31
|
+
files: ./coverage/.resultset.json,./coverage/coverage.json,./coverage/index.html
|
32
|
+
fail_ci_if_error: false
|
data/.gitignore
CHANGED
data/Aptfile.sh
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# https://github.com/phlowerteam/total_compressor
|
2
|
+
sudo apt-get install zlib1g
|
3
|
+
sudo apt-get install zlib1g-dev
|
4
|
+
sudo apt-get install zip
|
5
|
+
sudo apt-get install rar
|
6
|
+
sudo apt-get install p7zip-full
|
7
|
+
|
8
|
+
# PDF
|
9
|
+
sudo apt-get install poppler-utils
|
10
|
+
|
11
|
+
# OpenOffice
|
12
|
+
sudo apt-get install jodconverter
|
13
|
+
|
14
|
+
sudo apt-get install nautilus-filename-repairer
|
15
|
+
sudo apt-get install python3-chardet
|
16
|
+
sudo apt-get install xfonts-encodings
|
17
|
+
sudo apt-get install libfontenc1
|
18
|
+
sudo apt-get install console-setup
|
19
|
+
sudo apt-get install fontconfig
|
20
|
+
sudo apt-get install fontconfig-config
|
21
|
+
sudo apt-get install fonts-kacst
|
22
|
+
sudo apt-get install fonts-kacst-one
|
23
|
+
sudo apt-get install fonts-khmeros-core
|
24
|
+
sudo apt-get install fonts-lao
|
25
|
+
sudo apt-get install fonts-liberation
|
26
|
+
sudo apt-get install fonts-nanum
|
27
|
+
sudo apt-get install fonts-opensymbol
|
28
|
+
sudo apt-get install fonts-sil-gentium-basic
|
29
|
+
sudo apt-get install fonts-takao-pgothic
|
30
|
+
sudo apt-get install fonts-thai-tlwg
|
31
|
+
sudo apt-get install fonts-tlwg-garuda
|
32
|
+
sudo apt-get install fonts-tlwg-kinnari
|
33
|
+
sudo apt-get install fonts-tlwg-loma
|
34
|
+
sudo apt-get install fonts-tlwg-mono
|
35
|
+
sudo apt-get install fonts-tlwg-norasi
|
36
|
+
sudo apt-get install fonts-tlwg-purisa
|
37
|
+
sudo apt-get install fonts-tlwg-sawasdee
|
38
|
+
sudo apt-get install fonts-tlwg-typewriter
|
39
|
+
sudo apt-get install fonts-tlwg-typist
|
40
|
+
sudo apt-get install fonts-tlwg-typo
|
41
|
+
sudo apt-get install fonts-tlwg-umpush
|
42
|
+
sudo apt-get install fonts-tlwg-waree
|
43
|
+
sudo apt-get install gnome-font-viewer
|
44
|
+
sudo apt-get install gsfonts gucharmap
|
45
|
+
sudo apt-get install kbd
|
46
|
+
sudo apt-get install libfontconfig1
|
47
|
+
sudo apt-get install libfontenc1
|
48
|
+
sudo apt-get install libfreetype6
|
49
|
+
sudo apt-get install libxft2
|
50
|
+
sudo apt-get install fonts-ubuntu
|
51
|
+
sudo apt-get install fonts-wqy-microhei
|
52
|
+
sudo apt-get install x11-xfs-utils xfonts-base
|
53
|
+
sudo apt-get install xfonts-encodings
|
54
|
+
sudo apt-get install xfonts-scalable
|
55
|
+
sudo apt-get install xfonts-utils
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
## [0.7.0] - 2025-08-21
|
4
|
+
### Added
|
5
|
+
- Breaking changes: added a `root_folder` option so the storage folder can be shared between deployments; improved error processing ([883cafc], [b10a367])
|
6
|
+
|
7
|
+
HINT: to upgrade from an older version, you need to update the DB schema and migrate the existing data like this:
|
8
|
+
```rb
|
9
|
+
# db/migrate/20250804182426_upgrade_act_as_page_extractor_to_version.rb
|
10
|
+
class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
|
11
|
+
def change
|
12
|
+
add_column :documents, :pages_extraction_errors, :text, default: ''
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
# db/data/20250804183544_upgrade_act_as_page_extractor_to_version.rb
|
17
|
+
class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
|
18
|
+
def up
|
19
|
+
Document
|
20
|
+
.where(page_extraction_state: 'error.extraction')
|
21
|
+
.update_all(page_extraction_state: 'error_extraction')
|
22
|
+
end
|
23
|
+
|
24
|
+
def down
|
25
|
+
raise ActiveRecord::IrreversibleMigration
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
29
|
+
|
30
|
+
## [0.6.0] - 2024-08-31
|
31
|
+
### Changed
|
32
|
+
- Upgraded to Ruby 3.2 minimal version ([4b463a1], [be52d9c])
|
33
|
+
- Upgraded to ActiveRecord >=6.x.x ([7881613], [48e6d8b], [8bd3707])
|
34
|
+
- Improved docs & Readme ([c405044], [6e895bb])
|
35
|
+
|
36
|
+
## [0.5.0] - 2024-08-30
|
37
|
+
### Changed
|
38
|
+
- Upgraded to ActiveRecord 6.0 ([9eea586])
|
39
|
+
|
40
|
+
## [0.2.3] - 2020-06-05
|
41
|
+
### Changed
|
42
|
+
- Upgraded to ActiveRecord-5.2.0 ([cde1f36])
|
43
|
+
|
44
|
+
## [0.2.2] - 2020-06-04
|
45
|
+
### Changed
|
46
|
+
- Upgraded to ActiveRecord-5.1.0 ([f6ea8d7])
|
47
|
+
|
48
|
+
## [0.2.1] - 2020-05-11
|
49
|
+
### Changed
|
50
|
+
- Upgraded to ActiveRecord-5.0.0 ([5c595ee], [3eb4ad7])
|
51
|
+
|
52
|
+
## [0.1.6] - 2020-05-10
|
53
|
+
### Changed
|
54
|
+
- Updated libraries ([eca4346]), ([7d3bb4f], [203c689], [171cf27])
|
55
|
+
|
56
|
+
## [0.1.2] - 2018-11-29
|
57
|
+
### Changed
|
58
|
+
- Updated rubyzip library ([38c4156])
|
59
|
+
|
60
|
+
## [0.1.1] - 2017-01-10
|
61
|
+
### Changed
|
62
|
+
- Removed code coverage from Rails generators ([a990357])
|
63
|
+
|
64
|
+
## [0.1.0] - 2017-01-09
|
65
|
+
### Added
|
66
|
+
- Initial commit ([47c0950], [5225f33])
|
67
|
+
- Fixed tests ([e68a6b7])
|
data/Gemfile
CHANGED
@@ -3,11 +3,11 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in act_as_page_extractor.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
ruby '
|
6
|
+
ruby '3.2.3'
|
7
7
|
|
8
|
-
gem 'activerecord'
|
8
|
+
gem 'activerecord'
|
9
9
|
|
10
|
-
gem '
|
10
|
+
gem 'amazing_print'
|
11
11
|
|
12
12
|
gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
|
13
13
|
gem 'pdf_utils' # getting text from pdf
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
act_as_page_extractor (0.
|
5
|
-
activerecord (~> 6)
|
6
|
-
|
4
|
+
act_as_page_extractor (0.7.1)
|
5
|
+
activerecord (~> 6.0)
|
6
|
+
amazing_print (~> 1)
|
7
7
|
docsplit (~> 0)
|
8
8
|
filesize (~> 0)
|
9
9
|
pdf-reader (~> 1, >= 1.4)
|
@@ -27,8 +27,9 @@ GEM
|
|
27
27
|
tzinfo (~> 2.0)
|
28
28
|
zeitwerk (~> 2.3)
|
29
29
|
afm (0.2.2)
|
30
|
+
amazing_print (1.8.1)
|
30
31
|
awesome_print (1.9.2)
|
31
|
-
byebug (
|
32
|
+
byebug (12.0.0)
|
32
33
|
concurrent-ruby (1.3.4)
|
33
34
|
diff-lcs (1.5.1)
|
34
35
|
docile (1.4.1)
|
@@ -84,9 +85,9 @@ PLATFORMS
|
|
84
85
|
|
85
86
|
DEPENDENCIES
|
86
87
|
act_as_page_extractor!
|
87
|
-
activerecord
|
88
|
-
|
89
|
-
bundler (~> 2)
|
88
|
+
activerecord
|
89
|
+
amazing_print
|
90
|
+
bundler (~> 2.2, >= 2.2.33)
|
90
91
|
byebug
|
91
92
|
docsplit
|
92
93
|
filesize
|
@@ -99,7 +100,7 @@ DEPENDENCIES
|
|
99
100
|
total_compressor
|
100
101
|
|
101
102
|
RUBY VERSION
|
102
|
-
ruby 3.2.
|
103
|
+
ruby 3.2.3p157
|
103
104
|
|
104
105
|
BUNDLED WITH
|
105
106
|
2.4.1
|
data/README.md
CHANGED
@@ -1,30 +1,29 @@
|
|
1
|
+
[Code coverage on Codecov](https://codecov.io/gh/phlowerteam/act_as_page_extractor)
|
2
|
+
|
1
3
|
act_as_page_extractor
|
2
4
|
================
|
3
5
|
|
4
|
-
|
6
|
+
A library that extracts plain text from documents for subsequent processing, such as indexing and search.
|
5
7
|
|
6
8
|
## Installation
|
7
9
|
|
8
|
-
Install
|
9
|
-
|
10
|
+
Install all dependencies before use:
|
10
11
|
```sh
|
11
|
-
|
12
|
+
sh Aptfile.sh
|
12
13
|
```
|
13
|
-
Add this line to your application's Gemfile:
|
14
14
|
|
15
|
+
Add this to your Gemfile:
|
15
16
|
```rb
|
16
17
|
gem 'act_as_page_extractor'
|
17
|
-
bundle
|
18
18
|
```
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
Generate a migration, for example for a Document model:
|
23
22
|
```sh
|
24
23
|
rails g act_as_page_extractor:migration Document category_id user_id
|
25
24
|
```
|
26
25
|
|
27
|
-
|
26
|
+
This will generate two migration files:
|
28
27
|
```rb
|
29
28
|
class AddPageExtractorFields < ActiveRecord::Migration
|
30
29
|
def change
|
@@ -32,6 +31,7 @@ class AddPageExtractorFields < ActiveRecord::Migration
|
|
32
31
|
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
32
|
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
33
|
add_column :documents, :page_extraction_filesize, :string, default: ''
|
34
|
+
add_column :documents, :pages_extraction_errors, :text, default: ''
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
@@ -65,20 +65,20 @@ Add to model next parameters for initializing:
|
|
65
65
|
|
66
66
|
act_as_page_extractor options: {
|
67
67
|
document_class: 'Document',
|
68
|
-
save_as_pdf: true,
|
68
|
+
save_as_pdf: true, # store converted document as PDF
|
69
69
|
filename: :filename,
|
70
70
|
document_id: :document_id,
|
71
|
-
additional_fields: [:category_id, :user_id],
|
72
|
-
#
|
73
|
-
#
|
71
|
+
additional_fields: [:category_id, :user_id], # copy values of these fields from document to extracted_page
|
72
|
+
root_folder: Rails.root.to_s, # or "/full/path/to/project"; required so the folder can be shared between deployments
|
73
|
+
# file_storage: "/full/path/to/project/public/uploads/documents/storage" # optional
|
74
|
+
# pdf_storage: "/full/path/to/project/public/uploads/extracted/pdf/storage" # optional
|
74
75
|
}
|
75
76
|
|
76
77
|
has_many :extracted_pages, dependent: :destroy
|
77
78
|
end
|
78
79
|
```
|
79
80
|
|
80
|
-
|
81
|
-
|
81
|
+
The instance now provides several new methods:
|
82
82
|
```rb
|
83
83
|
document = Document.first
|
84
84
|
document.page_extract!
|
@@ -100,29 +100,24 @@ ActAsPageExtractor.statistics
|
|
100
100
|
|
101
101
|
Parameters of initializing **act_as_page_extractor**:
|
102
102
|
|
103
|
-
* **document_class**
|
104
|
-
* **save_as_pdf**
|
105
|
-
* **filename**
|
106
|
-
* **document_id**
|
107
|
-
* **additional_fields**
|
108
|
-
* **
|
109
|
-
* **
|
103
|
+
* **document_class** — The name of the model (e.g., `Document`).
|
104
|
+
* **save_as_pdf** — Boolean (`true`/`false`). Indicates whether to save a temporary PDF.
|
105
|
+
* **filename** — The field containing access to the file. This should be an object with a `url` method that returns the file path (e.g., a CarrierWave object with `filename.url`).
|
106
|
+
* **document_id** — The field name for storing the document ID.
|
107
|
+
* **additional_fields** — Extra fields to be added to the extracted page (useful for indexing, etc.).
|
108
|
+
* **root_folder** — The root folder to be shared across deployments (e.g., `Rails.root.to_s`).
|
109
|
+
* **file_storage** — Path for saving temporary files (default: `"#{root_folder}/public"`).
|
110
|
+
* **pdf_storage** — Path for saving PDFs (default: `"#{file_storage}/uploads/extracted/pdf"`).
|
110
111
|
|
111
112
|
## Run tests
|
112
113
|
```sh
|
114
|
+
bundle
|
113
115
|
rspec
|
114
116
|
```
|
115
|
-
## Contributing
|
116
|
-
1. Fork it
|
117
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
118
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
119
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
120
|
-
5. Create new Pull Request
|
121
117
|
|
122
118
|
## Contacts
|
123
|
-
https://github.com/phlowerteam
|
124
|
-
phlowerteam@gmail.com
|
119
|
+
https://github.com/phlowerteam / phlowerteam[A]gmail.com
|
125
120
|
|
126
121
|
## License
|
127
|
-
|
128
|
-
MIT License
|
122
|
+
|
123
|
+
MIT License © 2025 PhlowerTeam
|
@@ -16,15 +16,16 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ['lib']
|
19
|
+
spec.required_ruby_version = '>= 3.2'
|
19
20
|
|
20
|
-
spec.add_development_dependency 'bundler', '~> 2'
|
21
|
+
spec.add_development_dependency 'bundler', '~> 2.2', '>= 2.2.33'
|
21
22
|
spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
|
22
23
|
spec.add_development_dependency 'byebug', '~> 0'
|
23
24
|
spec.add_development_dependency 'rspec', '~> 0'
|
24
25
|
spec.add_development_dependency 'simplecov', '~> 0'
|
25
26
|
|
26
|
-
spec.add_runtime_dependency 'activerecord', '~> 6'
|
27
|
-
spec.add_runtime_dependency '
|
27
|
+
spec.add_runtime_dependency 'activerecord', '~> 6.0'
|
28
|
+
spec.add_runtime_dependency 'amazing_print', '~> 1'
|
28
29
|
spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
|
29
30
|
spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
|
30
31
|
spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
|
@@ -20,23 +20,22 @@ module ActAsPageExtractor
|
|
20
20
|
pdf_path if File.exist?(pdf_path)
|
21
21
|
end
|
22
22
|
end
|
23
|
+
rescue StandardError => e
|
24
|
+
add_error(e)
|
23
25
|
end
|
24
26
|
|
25
27
|
def convert_to_text
|
26
|
-
|
27
|
-
|
28
|
-
if @
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
# :nocov:
|
35
|
-
end
|
28
|
+
@pdf_pages = PdfUtils.info(@pdf_path).pages
|
29
|
+
if @pdf_pages
|
30
|
+
if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
|
31
|
+
else
|
32
|
+
# :nocov:
|
33
|
+
@pdf_pages = nil
|
34
|
+
raise ERRORS[:unknown_docsplit_error]
|
35
|
+
# :nocov:
|
36
36
|
end
|
37
|
-
# :nocov:
|
38
|
-
rescue
|
39
37
|
end
|
40
|
-
|
38
|
+
rescue StandardError => e
|
39
|
+
add_error(e)
|
41
40
|
end
|
42
41
|
end
|
@@ -5,8 +5,9 @@ module ActAsPageExtractor
|
|
5
5
|
def timeout_wrapper
|
6
6
|
result = nil
|
7
7
|
begin
|
8
|
-
result = Timeout::timeout(
|
9
|
-
rescue
|
8
|
+
result = Timeout::timeout(EXTRACTION_TIMEOUT) { yield }
|
9
|
+
rescue StandardError => e
|
10
|
+
add_error(e)
|
10
11
|
ensure
|
11
12
|
result
|
12
13
|
end
|
@@ -25,12 +26,13 @@ module ActAsPageExtractor
|
|
25
26
|
}
|
26
27
|
else
|
27
28
|
{
|
28
|
-
page_extraction_state: EXTRACTING_STATES[:
|
29
|
+
page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
|
29
30
|
page_extraction_pages: 0
|
30
31
|
}
|
31
32
|
end.merge({
|
32
33
|
page_extraction_doctype: @document_path&.split('.')&.last,
|
33
|
-
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
|
34
|
+
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty,
|
35
|
+
pages_extraction_errors: @pages_extraction_errors.chomp
|
34
36
|
})
|
35
37
|
self.update(updated_attributes)
|
36
38
|
end
|
@@ -39,6 +41,14 @@ module ActAsPageExtractor
|
|
39
41
|
self.extracted_pages.destroy_all
|
40
42
|
end
|
41
43
|
|
44
|
+
def add_error(e)
|
45
|
+
if ERRORS.values.include?(e.message)
|
46
|
+
@pages_extraction_errors << "#{e.message}\n\n"
|
47
|
+
else
|
48
|
+
@pages_extraction_errors << "#{e.class}, #{e.message}\n#{e.backtrace[0..ERROR_BACKTRACE_LINES].join("\n")}\n"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
42
52
|
# :nocov:
|
43
53
|
def debug_info
|
44
54
|
# ap "@tmp_dir"
|
@@ -1,6 +1,9 @@
|
|
1
1
|
module ActAsPageExtractor
|
2
2
|
def unzip_document
|
3
3
|
@document_path = @copy_document_path
|
4
|
+
|
5
|
+
return if VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
6
|
+
|
4
7
|
if validate_compress_types
|
5
8
|
result = TotalCompressor.decompress(@copy_document_path)
|
6
9
|
if result[:success] && result[:files].length == 1
|
@@ -12,4 +15,15 @@ module ActAsPageExtractor
|
|
12
15
|
end
|
13
16
|
end
|
14
17
|
end
|
18
|
+
|
19
|
+
def validate_compress_types
|
20
|
+
valid = VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
|
21
|
+
|
22
|
+
unless valid
|
23
|
+
@page_extraction_state = EXTRACTING_STATES[:error_doctype]
|
24
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
|
25
|
+
end
|
26
|
+
|
27
|
+
valid
|
28
|
+
end
|
15
29
|
end
|
@@ -1,22 +1,28 @@
|
|
1
1
|
module ActAsPageExtractor
|
2
|
-
VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
|
3
|
-
VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
|
4
|
-
'rtf', 'odt', 'htm', 'html'].freeze
|
5
|
-
|
6
2
|
def valid_document
|
7
3
|
validate_size && validate_doc_types
|
8
4
|
end
|
9
5
|
|
10
6
|
def validate_size
|
11
7
|
mb = 2**20
|
12
|
-
File.size(@copy_document_path) <= 1*mb
|
13
|
-
|
8
|
+
valid = File.size(@copy_document_path) <= 1*mb
|
9
|
+
|
10
|
+
unless valid
|
11
|
+
@page_extraction_state = EXTRACTING_STATES[:error_filesize]
|
12
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
|
15
|
+
valid
|
17
16
|
end
|
18
17
|
|
19
18
|
def validate_doc_types
|
20
|
-
VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
19
|
+
valid = VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
|
20
|
+
|
21
|
+
unless valid
|
22
|
+
@page_extraction_state = EXTRACTING_STATES[:error_doctype]
|
23
|
+
@pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
|
24
|
+
end
|
25
|
+
|
26
|
+
valid
|
21
27
|
end
|
22
28
|
end
|
@@ -19,9 +19,26 @@ require 'act_as_page_extractor/modules/saving.rb'
|
|
19
19
|
require 'act_as_page_extractor/modules/interface'
|
20
20
|
|
21
21
|
module ActAsPageExtractor
|
22
|
-
|
23
22
|
extend ActiveSupport::Concern
|
24
23
|
|
24
|
+
DEFAULT_ROOT_FOLDER = Dir.pwd.to_s
|
25
|
+
ERRORS = {
|
26
|
+
unknown_docsplit_error: 'Unknown Docsplit error'
|
27
|
+
}.freeze
|
28
|
+
ERROR_BACKTRACE_LINES = 15
|
29
|
+
EXTRACTING_STATES = {
|
30
|
+
new: 'new',
|
31
|
+
extracting: 'extracting',
|
32
|
+
extracted: 'extracted',
|
33
|
+
error_doctype: 'error_doctype',
|
34
|
+
error_extraction: 'error_extraction',
|
35
|
+
error_filesize: 'error_filesize'
|
36
|
+
}.freeze
|
37
|
+
EXTRACTION_TIMEOUT = 60*5 # 5 minutes
|
38
|
+
VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
|
39
|
+
VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
|
40
|
+
'rtf', 'odt', 'htm', 'html'].freeze
|
41
|
+
|
25
42
|
included do
|
26
43
|
before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
|
27
44
|
before_destroy :remove_files
|
@@ -35,23 +52,16 @@ module ActAsPageExtractor
|
|
35
52
|
ActAsPageExtractor.define_singleton_method(:document_class) {|*args| Object.const_get(options[:document_class]) }
|
36
53
|
define_method(:extracted_document_id){|*args| options[:document_id] }
|
37
54
|
define_method(:additional_fields){|*args| options[:additional_fields] || [] }
|
38
|
-
define_method(:
|
39
|
-
define_method(:
|
55
|
+
define_method(:root_folder){|*args| options[:root_folder] || DEFAULT_ROOT_FOLDER }
|
56
|
+
define_method(:file_storage){|*args| options[:file_storage] || "#{root_folder}/public".freeze }
|
57
|
+
define_method(:pdf_storage){|*args| options[:pdf_storage] || "#{file_storage}/uploads/extracted/pdf".freeze }
|
58
|
+
define_method(:tmp_extraction_file_storage){|*args| "#{root_folder}/tmp/page_extraction" }
|
40
59
|
end
|
41
60
|
end
|
42
61
|
|
43
|
-
EXTRACTING_STATES = {
|
44
|
-
new: 'new',
|
45
|
-
extracting: 'extracting',
|
46
|
-
extracted: 'extracted',
|
47
|
-
'error.extraction': 'error.extraction'
|
48
|
-
}.freeze
|
49
|
-
|
50
|
-
TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
|
51
|
-
FILE_STORAGE = "#{Dir.pwd}/public".freeze
|
52
|
-
PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
|
53
|
-
|
54
62
|
def initialized
|
63
|
+
@page_extraction_state = nil
|
64
|
+
@pages_extraction_errors = ''
|
55
65
|
# add all need callbacks
|
56
66
|
#on destroy remove pdf
|
57
67
|
|
@@ -90,7 +100,7 @@ module ActAsPageExtractor
|
|
90
100
|
end
|
91
101
|
|
92
102
|
def create_tmp_dir
|
93
|
-
@tmp_dir = "#{
|
103
|
+
@tmp_dir = "#{tmp_extraction_file_storage}/#{SecureRandom.hex(6)}"
|
94
104
|
FileUtils::mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
|
95
105
|
end
|
96
106
|
|
data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
CHANGED
@@ -4,5 +4,6 @@ class <%= migration_class_name_documents %> < ActiveRecord::Migration
|
|
4
4
|
add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
|
5
5
|
add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
|
6
6
|
add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
|
7
|
+
add_column :<%= documents_table_name %>, :pages_extraction_errors, :text, default: ''
|
7
8
|
end
|
8
9
|
end
|
@@ -15,7 +15,7 @@ describe ActAsPageExtractor do
|
|
15
15
|
'Oscar_Wilde_The_Happy_Prince_en.docx.rar',
|
16
16
|
'Oscar_Wilde_The_Happy_Prince_en.docx.7z'
|
17
17
|
].each do |document|
|
18
|
-
it "
|
18
|
+
it "extracts valid document #{document}" do
|
19
19
|
book = Book.new({doc_path: document})
|
20
20
|
allow(Book).to receive_message_chain('where') { [book] }
|
21
21
|
ActAsPageExtractor.start_extraction
|
@@ -25,21 +25,58 @@ describe ActAsPageExtractor do
|
|
25
25
|
unless document.match /pdf/
|
26
26
|
expect(book.pdf_path).to match /pdf/
|
27
27
|
expect(book.remove_files.count).to eq 1
|
28
|
+
expect(book.pages_extraction_errors).to be_empty
|
28
29
|
end
|
29
30
|
expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
describe 'errors processing' do
|
36
|
+
let(:book) { Book.new({doc_path: document}) }
|
37
|
+
|
38
|
+
before do
|
39
|
+
allow(Book).to receive_message_chain('where') { [book] }
|
40
|
+
end
|
41
|
+
|
42
|
+
context 'when invalid doctype' do
|
43
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.wrong' }
|
44
|
+
|
45
|
+
it "logs invalid doctype" do
|
46
|
+
ActAsPageExtractor.start_extraction
|
47
|
+
expect(book.page_extraction_state).to eq 'error_doctype'
|
48
|
+
expect(book.pages_extraction_errors).to match('error_doctype')
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context 'with extraction timeout' do
|
53
|
+
let(:error_msg) { 'execution expired' }
|
54
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
|
55
|
+
|
56
|
+
before do
|
57
|
+
allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
|
58
|
+
end
|
59
|
+
|
60
|
+
it "logs timeout error" do
|
61
|
+
ActAsPageExtractor.start_extraction
|
62
|
+
expect(book.page_extraction_state).to eq 'error_extraction'
|
63
|
+
expect(book.pages_extraction_errors).to match(error_msg)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context 'when Docsplit returns failure' do
|
68
|
+
let(:error_msg) { 'Unknown Docsplit error' }
|
69
|
+
let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
|
70
|
+
|
71
|
+
before do
|
72
|
+
allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
|
73
|
+
allow(Docsplit).to receive(:extract_text).and_raise(Timeout::Error.new(error_msg))
|
74
|
+
end
|
75
|
+
|
76
|
+
it "logs Docsplit error" do
|
41
77
|
ActAsPageExtractor.start_extraction
|
42
|
-
expect(book.page_extraction_state).to eq
|
78
|
+
expect(book.page_extraction_state).to eq 'error_extraction'
|
79
|
+
expect(book.pages_extraction_errors).to match(error_msg)
|
43
80
|
end
|
44
81
|
end
|
45
82
|
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/support/models.rb
CHANGED
@@ -14,7 +14,8 @@ class Book
|
|
14
14
|
:page_extraction_state,
|
15
15
|
:page_extraction_pages,
|
16
16
|
:page_extraction_doctype,
|
17
|
-
:page_extraction_filesize
|
17
|
+
:page_extraction_filesize,
|
18
|
+
:pages_extraction_errors
|
18
19
|
|
19
20
|
def self.before_create &block
|
20
21
|
yield
|
@@ -35,6 +36,7 @@ class Book
|
|
35
36
|
filename: :filename, # CarrierWave class with 'filename.url' method
|
36
37
|
document_id: :document_id,
|
37
38
|
additional_fields: [:category_id, :user_id],
|
39
|
+
root_folder: Dir.pwd.to_s,
|
38
40
|
file_storage: "#{Dir.pwd}/test/",
|
39
41
|
pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
|
40
42
|
}
|
@@ -44,6 +46,7 @@ class Book
|
|
44
46
|
@id = @category_id = @user_id = nil
|
45
47
|
@page_extraction_state = @page_extraction_pages = nil
|
46
48
|
@page_extraction_doctype = @page_extraction_filesize = nil
|
49
|
+
@pages_extraction_errors = ''
|
47
50
|
ExtractedPage.cleanup
|
48
51
|
end
|
49
52
|
|
@@ -62,7 +65,13 @@ class Book
|
|
62
65
|
|
63
66
|
def update params
|
64
67
|
params.each do |key, value|
|
65
|
-
|
68
|
+
if value.nil?
|
69
|
+
instance_eval("self.#{key} = nil")
|
70
|
+
elsif value.class == String
|
71
|
+
instance_eval("self.#{key} = \"#{value}\"")
|
72
|
+
else
|
73
|
+
instance_eval("self.#{key} = #{value}")
|
74
|
+
end
|
66
75
|
end
|
67
76
|
end
|
68
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: act_as_page_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- PhlowerTeam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-08-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,20 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2'
|
19
|
+
version: '2.2'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.2.33
|
20
23
|
type: :development
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
27
|
- - "~>"
|
25
28
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2'
|
29
|
+
version: '2.2'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 2.2.33
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: rake
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
@@ -92,16 +98,16 @@ dependencies:
|
|
92
98
|
requirements:
|
93
99
|
- - "~>"
|
94
100
|
- !ruby/object:Gem::Version
|
95
|
-
version: '6'
|
101
|
+
version: '6.0'
|
96
102
|
type: :runtime
|
97
103
|
prerelease: false
|
98
104
|
version_requirements: !ruby/object:Gem::Requirement
|
99
105
|
requirements:
|
100
106
|
- - "~>"
|
101
107
|
- !ruby/object:Gem::Version
|
102
|
-
version: '6'
|
108
|
+
version: '6.0'
|
103
109
|
- !ruby/object:Gem::Dependency
|
104
|
-
name:
|
110
|
+
name: amazing_print
|
105
111
|
requirement: !ruby/object:Gem::Requirement
|
106
112
|
requirements:
|
107
113
|
- - "~>"
|
@@ -212,9 +218,12 @@ executables: []
|
|
212
218
|
extensions: []
|
213
219
|
extra_rdoc_files: []
|
214
220
|
files:
|
221
|
+
- ".github/workflows/coverage.yml"
|
215
222
|
- ".gitignore"
|
216
223
|
- ".rspec"
|
217
224
|
- ".ruby-gemset"
|
225
|
+
- Aptfile.sh
|
226
|
+
- CHANGELOG.md
|
218
227
|
- Gemfile
|
219
228
|
- Gemfile.lock
|
220
229
|
- LICENSE
|
@@ -261,14 +270,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
261
270
|
requirements:
|
262
271
|
- - ">="
|
263
272
|
- !ruby/object:Gem::Version
|
264
|
-
version: '
|
273
|
+
version: '3.2'
|
265
274
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
266
275
|
requirements:
|
267
276
|
- - ">="
|
268
277
|
- !ruby/object:Gem::Version
|
269
278
|
version: '0'
|
270
279
|
requirements: []
|
271
|
-
rubygems_version: 3.4.
|
280
|
+
rubygems_version: 3.4.19
|
272
281
|
signing_key:
|
273
282
|
specification_version: 4
|
274
283
|
summary: Uses system calls
|