act_as_page_extractor 0.6.4 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f412c60bccb3fca934efecbc7922af07b41297423e6a2c4fbe04b8110a0e22e8
4
- data.tar.gz: 4b281d9c93de0955e90b1a9d500213b1fa7103c449d72354caaa3d5d29702ff5
3
+ metadata.gz: 0e2f7b9cc30bb8ba03a61ed873b43d4b1764997c65f470a751abe3a2042561e4
4
+ data.tar.gz: ec3e7125f2119a3666e8e9732ff2ae171df2b1c7e51a02eb9c0794e9f73a5bf5
5
5
  SHA512:
6
- metadata.gz: b545143db8d5fd51fb4c5c3d95d76b8122576e26e6587bb0b8c1ec62303e7e7bc5509554132ba9e65d47294fd2fa7c803a01634529c8e4ca8ecf9d0b3f1a392c
7
- data.tar.gz: 3ff648cca05fe842e97db5e5153399bcfcd34f98141cd47ad5ca511de588de67c080778b643cba9be372178a6d6fc497552082bece168d3e10be6788382c8426
6
+ metadata.gz: b4ba9e08f4a11c250cb9449035d5728dbd764870c85709aa76a2f532e3f41d5e895a813ddd8a7bf98b3885ca871c25aa29867db9f2e7b099e4d99504c3a81907
7
+ data.tar.gz: 36e66d4bbf0300f570cf964902c0163caad97c8c6ca0c8856c7aaba8c72a1716b0a6a8218e5176edebd8dd261786a9142f9ba9cbfe62cf0193c9b677fb17088a
@@ -0,0 +1,32 @@
1
+ # .github/workflows/coverage.yml
2
+ name: Coverage
3
+
4
+
5
+ on:
6
+ push:
7
+ branches: [ main, master ]
8
+ pull_request:
9
+ branches: [ main, master ]
10
+
11
+ jobs:
12
+ test:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Set up Ruby
17
+ uses: ruby/setup-ruby@v1
18
+ with:
19
+ ruby-version: 3.2.3
20
+ - name: Install dependencies
21
+ run: |
22
+ sudo apt-get update
23
+ sudo apt-get install -y libreoffice unoconv poppler-utils zlib1g zlib1g-dev zip rar p7zip-full
24
+ bundle install --jobs 4 --retry 3
25
+ - name: Run tests with coverage
26
+ run: |
27
+ bundle exec rspec
28
+ - name: Upload coverage to Codecov
29
+ uses: codecov/codecov-action@v4
30
+ with:
31
+ files: ./coverage/.resultset.json,./coverage/coverage.json,./coverage/index.html
32
+ fail_ci_if_error: false
data/.gitignore CHANGED
@@ -20,6 +20,7 @@ build/
20
20
  *.bridgesupport
21
21
  build-iPhoneOS/
22
22
  build-iPhoneSimulator/
23
+ .byebug_history
23
24
 
24
25
  ## Specific to RubyMotion (use of CocoaPods):
25
26
  #
data/Aptfile.sh ADDED
@@ -0,0 +1,55 @@
1
+ # https://github.com/phlowerteam/total_compressor
2
+ sudo apt-get install zlib1g
3
+ sudo apt-get install zlib1g-dev
4
+ sudo apt-get install zip
5
+ sudo apt-get install rar
6
+ sudo apt-get install p7zip-full
7
+
8
+ # PDF
9
+ sudo apt-get install poppler-utils
10
+
11
+ # OpenOffice
12
+ sudo apt-get install jodconverter
13
+
14
+ sudo apt-get install nautilus-filename-repairer
15
+ sudo apt-get install python3-chardet
16
+ sudo apt-get install xfonts-encodings
17
+ sudo apt-get install libfontenc1
18
+ sudo apt-get install console-setup
19
+ sudo apt-get install fontconfig
20
+ sudo apt-get install fontconfig-config
21
+ sudo apt-get install fonts-kacst
22
+ sudo apt-get install fonts-kacst-one
23
+ sudo apt-get install fonts-khmeros-core
24
+ sudo apt-get install fonts-lao
25
+ sudo apt-get install fonts-liberation
26
+ sudo apt-get install fonts-nanum
27
+ sudo apt-get install fonts-opensymbol
28
+ sudo apt-get install fonts-sil-gentium-basic
29
+ sudo apt-get install fonts-takao-pgothic
30
+ sudo apt-get install fonts-thai-tlwg
31
+ sudo apt-get install fonts-tlwg-garuda
32
+ sudo apt-get install fonts-tlwg-kinnari
33
+ sudo apt-get install fonts-tlwg-loma
34
+ sudo apt-get install fonts-tlwg-mono
35
+ sudo apt-get install fonts-tlwg-norasi
36
+ sudo apt-get install fonts-tlwg-purisa
37
+ sudo apt-get install fonts-tlwg-sawasdee
38
+ sudo apt-get install fonts-tlwg-typewriter
39
+ sudo apt-get install fonts-tlwg-typist
40
+ sudo apt-get install fonts-tlwg-typo
41
+ sudo apt-get install fonts-tlwg-umpush
42
+ sudo apt-get install fonts-tlwg-waree
43
+ sudo apt-get install gnome-font-viewer
44
+ sudo apt-get install gsfonts gucharmap
45
+ sudo apt-get install kbd
46
+ sudo apt-get install libfontconfig1
47
+ sudo apt-get install libfontenc1
48
+ sudo apt-get install libfreetype6
49
+ sudo apt-get install libxft2
50
+ sudo apt-get install fonts-ubuntu
51
+ sudo apt-get install fonts-wqy-microhei
52
+ sudo apt-get install x11-xfs-utils xfonts-base
53
+ sudo apt-get install xfonts-encodings
54
+ sudo apt-get install xfonts-scalable
55
+ sudo apt-get install xfonts-utils
data/CHANGELOG.md ADDED
@@ -0,0 +1,67 @@
1
+ # Changelog
2
+
3
+ ## [0.7.0] - 2025-08-21
4
+ ### Added
5
+ - Breaking changes: added a `root_folder` option so that the storage folder can be shared between deployments; improved error processing ([883cafc], [b10a367])
6
+
7
+ HINT: to upgrade from an older version, update the DB schema and run a data migration like this:
8
+ ```rb
9
+ # db/migrate/20250804182426_upgrade_act_as_page_extractor_to_version.rb
10
+ class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
11
+ def change
12
+ add_column :documents, :pages_extraction_errors, :text, default: ''
13
+ end
14
+ end
15
+
16
+ # db/data/20250804183544_upgrade_act_as_page_extractor_to_version.rb
17
+ class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
18
+ def up
19
+ Document
20
+ .where(page_extraction_state: 'error.extraction')
21
+ .update_all(page_extraction_state: 'error_extraction')
22
+ end
23
+
24
+ def down
25
+ raise ActiveRecord::IrreversibleMigration
26
+ end
27
+ end
28
+ ```
29
+
30
+ ## [0.6.0] - 2024-08-31
31
+ ### Changed
32
+ - Upgraded to Ruby 3.2 minimal version ([4b463a1], [be52d9c])
33
+ - Upgraded to ActiveRecord >=6.x.x ([7881613], [48e6d8b], [8bd3707])
34
+ - Improved docs & Readme ([c405044], [6e895bb])
35
+
36
+ ## [0.5.0] - 2024-08-30
37
+ ### Changed
38
+ - Upgraded to ActiveRecord 6.0 ([9eea586])
39
+
40
+ ## [0.2.3] - 2020-06-05
41
+ ### Changed
42
+ - Upgraded to ActiveRecord-5.2.0 ([cde1f36])
43
+
44
+ ## [0.2.2] - 2020-06-04
45
+ ### Changed
46
+ - Upgraded to ActiveRecord-5.1.0 ([f6ea8d7])
47
+
48
+ ## [0.2.1] - 2020-05-11
49
+ ### Changed
50
+ - Upgraded to ActiveRecord-5.0.0 ([5c595ee], [3eb4ad7])
51
+
52
+ ## [0.1.6] - 2020-05-10
53
+ ### Changed
54
+ - Updated libraries ([eca4346]), ([7d3bb4f], [203c689], [171cf27])
55
+
56
+ ## [0.1.2] - 2018-11-29
57
+ ### Changed
58
+ - Updated rubyzip library ([38c4156])
59
+
60
+ ## [0.1.1] - 2017-01-10
61
+ ### Changed
62
+ - Removed code coverage from Rails generators ([a990357])
63
+
64
+ ## [0.1.0] - 2017-01-09
65
+ ### Added
66
+ - Initial commit ([47c0950], [5225f33])
67
+ - Fixed tests ([e68a6b7])
data/Gemfile CHANGED
@@ -3,11 +3,11 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in total_compressor.gemspec
4
4
  gemspec
5
5
 
6
- ruby '>= 3.2'
6
+ ruby '3.2.3'
7
7
 
8
- gem 'activerecord', '~> 6'
8
+ gem 'activerecord'
9
9
 
10
- gem 'awesome_print'
10
+ gem 'amazing_print'
11
11
 
12
12
  gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
13
13
  gem 'pdf_utils' # getting text from pdf
data/Gemfile.lock CHANGED
@@ -1,9 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- act_as_page_extractor (0.6.4)
5
- activerecord (~> 6)
6
- awesome_print (~> 1)
4
+ act_as_page_extractor (0.7.0)
5
+ activerecord (>= 6)
6
+ amazing_print (~> 1)
7
7
  docsplit (~> 0)
8
8
  filesize (~> 0)
9
9
  pdf-reader (~> 1, >= 1.4)
@@ -27,8 +27,9 @@ GEM
27
27
  tzinfo (~> 2.0)
28
28
  zeitwerk (~> 2.3)
29
29
  afm (0.2.2)
30
+ amazing_print (1.8.1)
30
31
  awesome_print (1.9.2)
31
- byebug (11.1.3)
32
+ byebug (12.0.0)
32
33
  concurrent-ruby (1.3.4)
33
34
  diff-lcs (1.5.1)
34
35
  docile (1.4.1)
@@ -84,9 +85,9 @@ PLATFORMS
84
85
 
85
86
  DEPENDENCIES
86
87
  act_as_page_extractor!
87
- activerecord (~> 6)
88
- awesome_print
89
- bundler (~> 2)
88
+ activerecord
89
+ amazing_print
90
+ bundler (>= 2.2.33)
90
91
  byebug
91
92
  docsplit
92
93
  filesize
@@ -99,7 +100,7 @@ DEPENDENCIES
99
100
  total_compressor
100
101
 
101
102
  RUBY VERSION
102
- ruby 3.2.0p0
103
+ ruby 3.2.3p157
103
104
 
104
105
  BUNDLED WITH
105
106
  2.4.1
data/README.md CHANGED
@@ -1,30 +1,29 @@
1
+ [![codecov](https://codecov.io/gh/phlowerteam/act_as_page_extractor/branch/41-Feature-Improve-error-procesing/graph/badge.svg)](https://codecov.io/gh/phlowerteam/act_as_page_extractor)
2
+
1
3
  act_as_page_extractor
2
4
  ================
3
5
 
4
- Library for extracting plain text from documents(files) for further processing (indexing and searching).
6
+ A library that extracts plain text from documents for subsequent processing, such as indexing and search.
5
7
 
6
8
  ## Installation
7
9
 
8
- Install appropriate tools before using:
9
-
10
+ Install all dependencies before use:
10
11
  ```sh
11
- sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
12
+ sh Aptfile.sh
12
13
  ```
13
- Add this line to your application's Gemfile:
14
14
 
15
+ Add this to your Gemfile:
15
16
  ```rb
16
17
  gem 'act_as_page_extractor'
17
- bundle
18
18
  ```
19
19
  ## Usage
20
20
 
21
- For example, for model Document in the Rails framework we need run:
22
-
21
+ Generate a migration, for example for a Document model:
23
22
  ```sh
24
23
  rails g act_as_page_extractor:migration Document category_id user_id
25
24
  ```
26
25
 
27
- As a result we get two migration files:
26
+ This will generate two migration files:
28
27
  ```rb
29
28
  class AddPageExtractorFields < ActiveRecord::Migration
30
29
  def change
@@ -32,6 +31,7 @@ class AddPageExtractorFields < ActiveRecord::Migration
32
31
  add_column :documents, :page_extraction_pages, :integer, default: 0
33
32
  add_column :documents, :page_extraction_doctype, :string, default: ''
34
33
  add_column :documents, :page_extraction_filesize, :string, default: ''
34
+ add_column :documents, :pages_extraction_errors, :text, default: ''
35
35
  end
36
36
  end
37
37
 
@@ -65,20 +65,20 @@ Add to model next parameters for initializing:
65
65
 
66
66
  act_as_page_extractor options: {
67
67
  document_class: 'Document',
68
- save_as_pdf: true,
68
+ save_as_pdf: true, # store converted document as PDF
69
69
  filename: :filename,
70
70
  document_id: :document_id,
71
- additional_fields: [:category_id, :user_id],
72
- #file_storage: "/full/path/to/documents/storage",
73
- #pdf_storage: "/full/path/to/extracted/pdf/storage"
71
+ additional_fields: [:category_id, :user_id], # copy values of these fields from document to extracted_page
72
+ root_folder: Rails.root.to_s, # or "/full/path/to/project"; lets the folder be shared between deployments
73
+ # file_storage: "/full/path/to/project/public/uploads/documents/storage" # optional
74
+ # pdf_storage: "/full/path/to/project/public/uploads/extracted/pdf/storage" # optional
74
75
  }
75
76
 
76
77
  has_many :extracted_pages, dependent: :destroy
77
78
  end
78
79
  ```
79
80
 
80
- Now our instance has few new methods:
81
-
81
+ The instance now provides several new methods:
82
82
  ```rb
83
83
  document = Document.first
84
84
  document.page_extract!
@@ -100,29 +100,24 @@ ActAsPageExtractor.statistics
100
100
 
101
101
  Parameters of initializing **act_as_page_extractor**:
102
102
 
103
- * **document_class** - name of model (e.g. Document)
104
- * **save_as_pdf** - boolean [true, false] when we want save temporary pdf
105
- * **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
106
- * **document_id** - name for saving id
107
- * **additional_fields** - additional fields that added to extracted page (e.g. for indexing, etc.)
108
- * **file_storage** - path for saving tmp files (by default it is "public")
109
- * **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
103
+ * **document_class** — The name of the model (e.g., `Document`).
104
+ * **save_as_pdf** — Boolean (`true`/`false`). Indicates whether to save a temporary PDF.
105
+ * **filename** — The field containing access to the file. This should be an object with a `url` method that returns the file path (e.g., a CarrierWave object with `filename.url`).
106
+ * **document_id** — The field name for storing the document ID.
107
+ * **additional_fields** — Extra fields to be added to the extracted page (useful for indexing, etc.).
108
+ * **root_folder** — The root folder to be shared across deployments (e.g., `Rails.root.to_s`).
109
+ * **file_storage** — Path for saving temporary files (default: `"public"`).
110
+ * **pdf_storage** — Path for saving PDFs (default: `"public/uploads/extracted/pdf"`).
110
111
 
111
112
  ## Run tests
112
113
  ```sh
114
+ bundle
113
115
  rspec
114
116
  ```
115
- ## Contributing
116
- 1. Fork it
117
- 2. Create your feature branch (`git checkout -b my-new-feature`)
118
- 3. Commit your changes (`git commit -am 'Add some feature'`)
119
- 4. Push to the branch (`git push origin my-new-feature`)
120
- 5. Create new Pull Request
121
117
 
122
118
  ## Contacts
123
- https://github.com/phlowerteam
124
- phlowerteam@gmail.com
119
+ https://github.com/phlowerteam / phlowerteam[A]gmail.com
125
120
 
126
121
  ## License
127
- Copyright (c) 2024 PhlowerTeam
128
- MIT License
122
+
123
+ MIT License © 2025 PhlowerTeam
@@ -16,15 +16,16 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ['lib']
19
+ spec.required_ruby_version = '>= 3.2'
19
20
 
20
- spec.add_development_dependency 'bundler', '~> 2'
21
+ spec.add_development_dependency 'bundler', '>= 2.2.33'
21
22
  spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
22
23
  spec.add_development_dependency 'byebug', '~> 0'
23
24
  spec.add_development_dependency 'rspec', '~> 0'
24
25
  spec.add_development_dependency 'simplecov', '~> 0'
25
26
 
26
- spec.add_runtime_dependency 'activerecord', '~> 6'
27
- spec.add_runtime_dependency 'awesome_print', '~> 1'
27
+ spec.add_runtime_dependency 'activerecord', '>= 6'
28
+ spec.add_runtime_dependency 'amazing_print', '~> 1'
28
29
  spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
29
30
  spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
30
31
  spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
@@ -20,23 +20,22 @@ module ActAsPageExtractor
20
20
  pdf_path if File.exist?(pdf_path)
21
21
  end
22
22
  end
23
+ rescue StandardError => e
24
+ add_error(e)
23
25
  end
24
26
 
25
27
  def convert_to_text
26
- begin
27
- @pdf_pages = PdfUtils.info(@pdf_path).pages
28
- if @pdf_pages
29
- if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
30
- else
31
- # :nocov:
32
- @pdf_pages = nil
33
- raise
34
- # :nocov:
35
- end
28
+ @pdf_pages = PdfUtils.info(@pdf_path).pages
29
+ if @pdf_pages
30
+ if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
31
+ else
32
+ # :nocov:
33
+ @pdf_pages = nil
34
+ raise ERRORS[:unknown_docsplit_error]
35
+ # :nocov:
36
36
  end
37
- # :nocov:
38
- rescue
39
37
  end
40
- # :nocov:
38
+ rescue StandardError => e
39
+ add_error(e)
41
40
  end
42
41
  end
@@ -5,8 +5,9 @@ module ActAsPageExtractor
5
5
  def timeout_wrapper
6
6
  result = nil
7
7
  begin
8
- result = Timeout::timeout(60*5) { yield }
9
- rescue
8
+ result = Timeout::timeout(EXTRACTION_TIMEOUT) { yield }
9
+ rescue StandardError => e
10
+ add_error(e)
10
11
  ensure
11
12
  result
12
13
  end
@@ -25,12 +26,13 @@ module ActAsPageExtractor
25
26
  }
26
27
  else
27
28
  {
28
- page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
29
+ page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
29
30
  page_extraction_pages: 0
30
31
  }
31
32
  end.merge({
32
33
  page_extraction_doctype: @document_path&.split('.')&.last,
33
- page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
34
+ page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty,
35
+ pages_extraction_errors: @pages_extraction_errors.chomp
34
36
  })
35
37
  self.update(updated_attributes)
36
38
  end
@@ -39,6 +41,14 @@ module ActAsPageExtractor
39
41
  self.extracted_pages.destroy_all
40
42
  end
41
43
 
44
+ def add_error(e)
45
+ if ERRORS.values.include?(e.message)
46
+ @pages_extraction_errors << "#{e.message}\n\n"
47
+ else
48
+ @pages_extraction_errors << "#{e.class}, #{e.message}\n#{e.backtrace[0..ERROR_BACKTRACE_LINES].join("\n")}\n"
49
+ end
50
+ end
51
+
42
52
  # :nocov:
43
53
  def debug_info
44
54
  # ap "@tmp_dir"
@@ -1,6 +1,9 @@
1
1
  module ActAsPageExtractor
2
2
  def unzip_document
3
3
  @document_path = @copy_document_path
4
+
5
+ return if VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
6
+
4
7
  if validate_compress_types
5
8
  result = TotalCompressor.decompress(@copy_document_path)
6
9
  if result[:success] && result[:files].length == 1
@@ -12,4 +15,15 @@ module ActAsPageExtractor
12
15
  end
13
16
  end
14
17
  end
18
+
19
+ def validate_compress_types
20
+ valid = VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
21
+
22
+ unless valid
23
+ @page_extraction_state = EXTRACTING_STATES[:error_doctype]
24
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
25
+ end
26
+
27
+ valid
28
+ end
15
29
  end
@@ -1,22 +1,28 @@
1
1
  module ActAsPageExtractor
2
- VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
3
- VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
4
- 'rtf', 'odt', 'htm', 'html'].freeze
5
-
6
2
  def valid_document
7
3
  validate_size && validate_doc_types
8
4
  end
9
5
 
10
6
  def validate_size
11
7
  mb = 2**20
12
- File.size(@copy_document_path) <= 1*mb
13
- end
8
+ valid = File.size(@copy_document_path) <= 1*mb
9
+
10
+ unless valid
11
+ @page_extraction_state = EXTRACTING_STATES[:error_filesize]
12
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
13
+ end
14
14
 
15
- def validate_compress_types
16
- VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
15
+ valid
17
16
  end
18
17
 
19
18
  def validate_doc_types
20
- VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
19
+ valid = VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
20
+
21
+ unless valid
22
+ @page_extraction_state = EXTRACTING_STATES[:error_doctype]
23
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
24
+ end
25
+
26
+ valid
21
27
  end
22
28
  end
@@ -1,5 +1,5 @@
1
1
  # :nocov:
2
2
  module ActAsPageExtractor
3
- VERSION = "0.6.4"
3
+ VERSION = "0.7.0"
4
4
  end
5
5
  # :nocov:
@@ -19,9 +19,26 @@ require 'act_as_page_extractor/modules/saving.rb'
19
19
  require 'act_as_page_extractor/modules/interface'
20
20
 
21
21
  module ActAsPageExtractor
22
-
23
22
  extend ActiveSupport::Concern
24
23
 
24
+ DEFAULT_ROOT_FOLDER = Dir.pwd.to_s
25
+ ERRORS = {
26
+ unknown_docsplit_error: 'Unknown Docsplit error'
27
+ }.freeze
28
+ ERROR_BACKTRACE_LINES = 15
29
+ EXTRACTING_STATES = {
30
+ new: 'new',
31
+ extracting: 'extracting',
32
+ extracted: 'extracted',
33
+ error_doctype: 'error_doctype',
34
+ error_extraction: 'error_extraction',
35
+ error_filesize: 'error_filesize'
36
+ }.freeze
37
+ EXTRACTION_TIMEOUT = 60*5 # 5 minutes
38
+ VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
39
+ VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
40
+ 'rtf', 'odt', 'htm', 'html'].freeze
41
+
25
42
  included do
26
43
  before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
27
44
  before_destroy :remove_files
@@ -35,23 +52,16 @@ module ActAsPageExtractor
35
52
  ActAsPageExtractor.define_singleton_method(:document_class) {|*args| Object.const_get(options[:document_class]) }
36
53
  define_method(:extracted_document_id){|*args| options[:document_id] }
37
54
  define_method(:additional_fields){|*args| options[:additional_fields] || [] }
38
- define_method(:file_storage){|*args| options[:file_storage] || FILE_STORAGE }
39
- define_method(:pdf_storage){|*args| options[:pdf_storage] || PDF_STORAGE }
55
+ define_method(:root_folder){|*args| options[:root_folder] || DEFAULT_ROOT_FOLDER }
56
+ define_method(:file_storage){|*args| options[:file_storage] || "#{root_folder}/public".freeze }
57
+ define_method(:pdf_storage){|*args| options[:pdf_storage] || "#{file_storage}/uploads/extracted/pdf".freeze }
58
+ define_method(:tmp_extraction_file_storage){|*args| "#{root_folder}/tmp/page_extraction" }
40
59
  end
41
60
  end
42
61
 
43
- EXTRACTING_STATES = {
44
- new: 'new',
45
- extracting: 'extracting',
46
- extracted: 'extracted',
47
- 'error.extraction': 'error.extraction'
48
- }.freeze
49
-
50
- TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
51
- FILE_STORAGE = "#{Dir.pwd}/public".freeze
52
- PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
53
-
54
62
  def initialized
63
+ @page_extraction_state = nil
64
+ @pages_extraction_errors = ''
55
65
  # add all need callbacks
56
66
  #on destroy remove pdf
57
67
 
@@ -90,7 +100,7 @@ module ActAsPageExtractor
90
100
  end
91
101
 
92
102
  def create_tmp_dir
93
- @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
103
+ @tmp_dir = "#{tmp_extraction_file_storage}/#{SecureRandom.hex(6)}"
94
104
  FileUtils::mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
95
105
  end
96
106
 
@@ -4,5 +4,6 @@ class <%= migration_class_name_documents %> < ActiveRecord::Migration
4
4
  add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
5
5
  add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
6
6
  add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
7
+ add_column :<%= documents_table_name %>, :pages_extraction_errors, :text, default: ''
7
8
  end
8
9
  end
@@ -15,7 +15,7 @@ describe ActAsPageExtractor do
15
15
  'Oscar_Wilde_The_Happy_Prince_en.docx.rar',
16
16
  'Oscar_Wilde_The_Happy_Prince_en.docx.7z'
17
17
  ].each do |document|
18
- it "extraction valid document #{document}" do
18
+ it "extracts valid document #{document}" do
19
19
  book = Book.new({doc_path: document})
20
20
  allow(Book).to receive_message_chain('where') { [book] }
21
21
  ActAsPageExtractor.start_extraction
@@ -25,21 +25,58 @@ describe ActAsPageExtractor do
25
25
  unless document.match /pdf/
26
26
  expect(book.pdf_path).to match /pdf/
27
27
  expect(book.remove_files.count).to eq 1
28
+ expect(book.pages_extraction_errors).to be_empty
28
29
  end
29
30
  expect(ActAsPageExtractor.statistics).to include(supported_documents: 1)
30
31
  end
31
32
  end
32
33
  end
33
34
 
34
- context 'incorrect extraction' do
35
- [
36
- 'Oscar_Wilde_The_Happy_Prince_en.wrong',
37
- ].each do |document|
38
- it "extraction invalid document #{document}" do
39
- book = Book.new({doc_path: document})
40
- allow(Book).to receive_message_chain('where') { [book] }
35
+ describe 'errors processing' do
36
+ let(:book) { Book.new({doc_path: document}) }
37
+
38
+ before do
39
+ allow(Book).to receive_message_chain('where') { [book] }
40
+ end
41
+
42
+ context 'when invalid doctype' do
43
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.wrong' }
44
+
45
+ it "logs invalid doctype" do
46
+ ActAsPageExtractor.start_extraction
47
+ expect(book.page_extraction_state).to eq 'error_doctype'
48
+ expect(book.pages_extraction_errors).to match('error_doctype')
49
+ end
50
+ end
51
+
52
+ context 'with extraction timeout' do
53
+ let(:error_msg) { 'execution expired' }
54
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
55
+
56
+ before do
57
+ allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
58
+ end
59
+
60
+ it "logs timeout error" do
61
+ ActAsPageExtractor.start_extraction
62
+ expect(book.page_extraction_state).to eq 'error_extraction'
63
+ expect(book.pages_extraction_errors).to match(error_msg)
64
+ end
65
+ end
66
+
67
+ context 'when Docsplit returns failure' do
68
+ let(:error_msg) { 'Unknown Docsplit error' }
69
+ let(:document) { 'Oscar_Wilde_The_Happy_Prince_en.docx' }
70
+
71
+ before do
72
+ allow(Docsplit).to receive(:extract_pdf).and_raise(Timeout::Error.new(error_msg))
73
+ allow(Docsplit).to receive(:extract_text).and_raise(Timeout::Error.new(error_msg))
74
+ end
75
+
76
+ it "logs Docsplit error" do
41
77
  ActAsPageExtractor.start_extraction
42
- expect(book.page_extraction_state).to eq ActAsPageExtractor::EXTRACTING_STATES[:'error.extraction']
78
+ expect(book.page_extraction_state).to eq 'error_extraction'
79
+ expect(book.pages_extraction_errors).to match(error_msg)
43
80
  end
44
81
  end
45
82
  end
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,7 @@ unless ENV['SKIP_COVERAGE']
3
3
  SimpleCov.start 'rails' do
4
4
  add_filter 'vendor'
5
5
  end
6
- SimpleCov.minimum_coverage 100
6
+ SimpleCov.minimum_coverage 98
7
7
  end
8
8
 
9
9
  require 'rspec'
@@ -14,7 +14,8 @@ class Book
14
14
  :page_extraction_state,
15
15
  :page_extraction_pages,
16
16
  :page_extraction_doctype,
17
- :page_extraction_filesize
17
+ :page_extraction_filesize,
18
+ :pages_extraction_errors
18
19
 
19
20
  def self.before_create &block
20
21
  yield
@@ -35,6 +36,7 @@ class Book
35
36
  filename: :filename, # CarrierWave class with 'filename.url' method
36
37
  document_id: :document_id,
37
38
  additional_fields: [:category_id, :user_id],
39
+ root_folder: Dir.pwd.to_s,
38
40
  file_storage: "#{Dir.pwd}/test/",
39
41
  pdf_storage: "#{Dir.pwd}/test/uploads/extracted/pdf"
40
42
  }
@@ -44,6 +46,7 @@ class Book
44
46
  @id = @category_id = @user_id = nil
45
47
  @page_extraction_state = @page_extraction_pages = nil
46
48
  @page_extraction_doctype = @page_extraction_filesize = nil
49
+ @pages_extraction_errors = ''
47
50
  ExtractedPage.cleanup
48
51
  end
49
52
 
@@ -62,7 +65,13 @@ class Book
62
65
 
63
66
  def update params
64
67
  params.each do |key, value|
65
- instance_eval("self.#{key} = #{value.class == String ? '\'' + value + '\'': value }")
68
+ if value.nil?
69
+ instance_eval("self.#{key} = nil")
70
+ elsif value.class == String
71
+ instance_eval("self.#{key} = \"#{value}\"")
72
+ else
73
+ instance_eval("self.#{key} = #{value}")
74
+ end
66
75
  end
67
76
  end
68
77
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: act_as_page_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.4
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - PhlowerTeam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-31 00:00:00.000000000 Z
11
+ date: 2025-08-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2'
19
+ version: 2.2.33
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2'
26
+ version: 2.2.33
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -90,18 +90,18 @@ dependencies:
90
90
  name: activerecord
91
91
  requirement: !ruby/object:Gem::Requirement
92
92
  requirements:
93
- - - "~>"
93
+ - - ">="
94
94
  - !ruby/object:Gem::Version
95
95
  version: '6'
96
96
  type: :runtime
97
97
  prerelease: false
98
98
  version_requirements: !ruby/object:Gem::Requirement
99
99
  requirements:
100
- - - "~>"
100
+ - - ">="
101
101
  - !ruby/object:Gem::Version
102
102
  version: '6'
103
103
  - !ruby/object:Gem::Dependency
104
- name: awesome_print
104
+ name: amazing_print
105
105
  requirement: !ruby/object:Gem::Requirement
106
106
  requirements:
107
107
  - - "~>"
@@ -212,9 +212,12 @@ executables: []
212
212
  extensions: []
213
213
  extra_rdoc_files: []
214
214
  files:
215
+ - ".github/workflows/coverage.yml"
215
216
  - ".gitignore"
216
217
  - ".rspec"
217
218
  - ".ruby-gemset"
219
+ - Aptfile.sh
220
+ - CHANGELOG.md
218
221
  - Gemfile
219
222
  - Gemfile.lock
220
223
  - LICENSE
@@ -261,14 +264,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
261
264
  requirements:
262
265
  - - ">="
263
266
  - !ruby/object:Gem::Version
264
- version: '0'
267
+ version: '3.2'
265
268
  required_rubygems_version: !ruby/object:Gem::Requirement
266
269
  requirements:
267
270
  - - ">="
268
271
  - !ruby/object:Gem::Version
269
272
  version: '0'
270
273
  requirements: []
271
- rubygems_version: 3.4.1
274
+ rubygems_version: 3.4.19
272
275
  signing_key:
273
276
  specification_version: 4
274
277
  summary: Uses system calls