act_as_page_extractor 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/coverage.yml +32 -0
  3. data/.gitignore +1 -0
  4. data/Aptfile.sh +55 -0
  5. data/CHANGELOG.md +67 -0
  6. data/Gemfile +4 -2
  7. data/Gemfile.lock +13 -9
  8. data/README.md +27 -32
  9. data/act_as_page_extractor.gemspec +4 -3
  10. data/lib/act_as_page_extractor/modules/extracting.rb +20 -14
  11. data/lib/act_as_page_extractor/modules/interface.rb +1 -1
  12. data/lib/act_as_page_extractor/modules/tools.rb +14 -4
  13. data/lib/act_as_page_extractor/modules/unzipping.rb +14 -0
  14. data/lib/act_as_page_extractor/modules/validating.rb +15 -9
  15. data/lib/act_as_page_extractor/version.rb +1 -1
  16. data/lib/act_as_page_extractor.rb +27 -17
  17. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +1 -0
  18. data/spec/act_as_page_extractor_spec.rb +58 -21
  19. data/spec/spec_helper.rb +1 -1
  20. data/spec/support/models.rb +11 -2
  21. data/test/Oscar_Wilde_The_Happy_Prince_en.doc +0 -0
  22. data/test/Oscar_Wilde_The_Happy_Prince_en.docx +0 -0
  23. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.7z +0 -0
  24. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.rar +0 -0
  25. data/test/Oscar_Wilde_The_Happy_Prince_en.docx.zip +0 -0
  26. data/test/Oscar_Wilde_The_Happy_Prince_en.html +395 -0
  27. data/test/Oscar_Wilde_The_Happy_Prince_en.odt +0 -0
  28. data/test/Oscar_Wilde_The_Happy_Prince_en.pdf +0 -0
  29. data/test/Oscar_Wilde_The_Happy_Prince_en.rtf +257 -0
  30. data/test/Oscar_Wilde_The_Happy_Prince_en.txt +79 -0
  31. data/test/Oscar_Wilde_The_Happy_Prince_en.wrong +0 -0
  32. metadata +36 -33
  33. data/test/test-doc-3-pages.doc +0 -0
  34. data/test/test-doc-3-pages.docx +0 -0
  35. data/test/test-doc-3-pages.docx.7z +0 -0
  36. data/test/test-doc-3-pages.docx.rar +0 -0
  37. data/test/test-doc-3-pages.docx.zip +0 -0
  38. data/test/test-doc-3-pages.html +0 -279
  39. data/test/test-doc-3-pages.odt +0 -0
  40. data/test/test-doc-3-pages.pdf +0 -0
  41. data/test/test-doc-3-pages.rtf +0 -339
  42. data/test/test-doc-3-pages.txt +0 -125
  43. data/test/test-doc-3-pages.wrong +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c4528cd1633b4691d46d7cef1eac6b66c986bfd3ebdebedec786b01bab0f0e6
4
- data.tar.gz: 05b6967bb2092e17fefca9923614a3b16fae9e5dc3bfd5caf91a16f4e93b6d57
3
+ metadata.gz: 0e2f7b9cc30bb8ba03a61ed873b43d4b1764997c65f470a751abe3a2042561e4
4
+ data.tar.gz: ec3e7125f2119a3666e8e9732ff2ae171df2b1c7e51a02eb9c0794e9f73a5bf5
5
5
  SHA512:
6
- metadata.gz: 1ea4ba52c2cfc10c21ca3ca82791af40689140de14ce56781859e6cee6ce538d8c8ad9c9b64f2acb39f0fce4848ca548395f3442146d2e1b4e7f4711f2f0c801
7
- data.tar.gz: 3cfbd8aef08741aceaf34103bdfbeb45ddbf9b34d863e9daf92f2d9a7c1636577cc265770cfd318287d0b77dacfb896c5c2d68923ed65393c250246f29053e3d
6
+ metadata.gz: b4ba9e08f4a11c250cb9449035d5728dbd764870c85709aa76a2f532e3f41d5e895a813ddd8a7bf98b3885ca871c25aa29867db9f2e7b099e4d99504c3a81907
7
+ data.tar.gz: 36e66d4bbf0300f570cf964902c0163caad97c8c6ca0c8856c7aaba8c72a1716b0a6a8218e5176edebd8dd261786a9142f9ba9cbfe62cf0193c9b677fb17088a
@@ -0,0 +1,32 @@
1
+ # .github/workflows/coverage.yml
2
+ name: Coverage
3
+
4
+
5
+ on:
6
+ push:
7
+ branches: [ main, master ]
8
+ pull_request:
9
+ branches: [ main, master ]
10
+
11
+ jobs:
12
+ test:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Set up Ruby
17
+ uses: ruby/setup-ruby@v1
18
+ with:
19
+ ruby-version: 3.2.3
20
+ - name: Install dependencies
21
+ run: |
22
+ sudo apt-get update
23
+ sudo apt-get install -y libreoffice unoconv poppler-utils zlib1g zlib1g-dev zip rar p7zip-full
24
+ bundle install --jobs 4 --retry 3
25
+ - name: Run tests with coverage
26
+ run: |
27
+ bundle exec rspec
28
+ - name: Upload coverage to Codecov
29
+ uses: codecov/codecov-action@v4
30
+ with:
31
+ files: ./coverage/.resultset.json,./coverage/coverage.json,./coverage/index.html
32
+ fail_ci_if_error: false
data/.gitignore CHANGED
@@ -20,6 +20,7 @@ build/
20
20
  *.bridgesupport
21
21
  build-iPhoneOS/
22
22
  build-iPhoneSimulator/
23
+ .byebug_history
23
24
 
24
25
  ## Specific to RubyMotion (use of CocoaPods):
25
26
  #
data/Aptfile.sh ADDED
@@ -0,0 +1,55 @@
1
+ # https://github.com/phlowerteam/total_compressor
2
+ sudo apt-get install zlib1g
3
+ sudo apt-get install zlib1g-dev
4
+ sudo apt-get install zip
5
+ sudo apt-get install rar
6
+ sudo apt-get install p7zip-full
7
+
8
+ # PDF
9
+ sudo apt-get install poppler-utils
10
+
11
+ # OpenOffice
12
+ sudo apt-get install jodconverter
13
+
14
+ sudo apt-get install nautilus-filename-repairer
15
+ sudo apt-get install python3-chardet
16
+ sudo apt-get install xfonts-encodings
17
+ sudo apt-get install libfontenc1
18
+ sudo apt-get install console-setup
19
+ sudo apt-get install fontconfig
20
+ sudo apt-get install fontconfig-config
21
+ sudo apt-get install fonts-kacst
22
+ sudo apt-get install fonts-kacst-one
23
+ sudo apt-get install fonts-khmeros-core
24
+ sudo apt-get install fonts-lao
25
+ sudo apt-get install fonts-liberation
26
+ sudo apt-get install fonts-nanum
27
+ sudo apt-get install fonts-opensymbol
28
+ sudo apt-get install fonts-sil-gentium-basic
29
+ sudo apt-get install fonts-takao-pgothic
30
+ sudo apt-get install fonts-thai-tlwg
31
+ sudo apt-get install fonts-tlwg-garuda
32
+ sudo apt-get install fonts-tlwg-kinnari
33
+ sudo apt-get install fonts-tlwg-loma
34
+ sudo apt-get install fonts-tlwg-mono
35
+ sudo apt-get install fonts-tlwg-norasi
36
+ sudo apt-get install fonts-tlwg-purisa
37
+ sudo apt-get install fonts-tlwg-sawasdee
38
+ sudo apt-get install fonts-tlwg-typewriter
39
+ sudo apt-get install fonts-tlwg-typist
40
+ sudo apt-get install fonts-tlwg-typo
41
+ sudo apt-get install fonts-tlwg-umpush
42
+ sudo apt-get install fonts-tlwg-waree
43
+ sudo apt-get install gnome-font-viewer
44
+ sudo apt-get install gsfonts gucharmap
45
+ sudo apt-get install kbd
46
+ sudo apt-get install libfontconfig1
47
+ sudo apt-get install libfontenc1
48
+ sudo apt-get install libfreetype6
49
+ sudo apt-get install libxft2
50
+ sudo apt-get install fonts-ubuntu
51
+ sudo apt-get install fonts-wqy-microhei
52
+ sudo apt-get install x11-xfs-utils xfonts-base
53
+ sudo apt-get install xfonts-encodings
54
+ sudo apt-get install xfonts-scalable
55
+ sudo apt-get install xfonts-utils
data/CHANGELOG.md ADDED
@@ -0,0 +1,67 @@
1
+ # Changelog
2
+
3
+ ## [0.7.0] - 2025-08-21
4
+ ### Added
5
+ - Breaking changes: added a `root_folder` option so that storage folders can be shared between deployments; improved error handling ([883cafc], [b10a367])
6
+
7
+ HINT: to upgrade from an older version, update the DB schema and migrate the existing data like this:
8
+ ```rb
9
+ # db/migrate/20250804182426_upgrade_act_as_page_extractor_to_version.rb
10
+ class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
11
+ def change
12
+ add_column :documents, :pages_extraction_errors, :text, default: ''
13
+ end
14
+ end
15
+
16
+ # db/data/20250804183544_upgrade_act_as_page_extractor_to_version.rb
17
+ class UpgradeActAsPageExtractorToVersion < ActiveRecord::Migration
18
+ def up
19
+ Document
20
+ .where(page_extraction_state: 'error.extraction')
21
+ .update_all(page_extraction_state: 'error_extraction')
22
+ end
23
+
24
+ def down
25
+ raise ActiveRecord::IrreversibleMigration
26
+ end
27
+ end
28
+ ```
29
+
30
+ ## [0.6.0] - 2024-08-31
31
+ ### Changed
32
+ - Upgraded to Ruby 3.2 minimal version ([4b463a1], [be52d9c])
33
+ - Upgraded to ActiveRecord >=6.x.x ([7881613], [48e6d8b], [8bd3707])
34
+ - Improved docs & Readme ([c405044], [6e895bb])
35
+
36
+ ## [0.5.0] - 2024-08-30
37
+ ### Changed
38
+ - Upgraded to ActiveRecord 6.0 ([9eea586])
39
+
40
+ ## [0.2.3] - 2020-06-05
41
+ ### Changed
42
+ - Upgraded to ActiveRecord-5.2.0 ([cde1f36])
43
+
44
+ ## [0.2.2] - 2020-06-04
45
+ ### Changed
46
+ - Upgraded to ActiveRecord-5.1.0 ([f6ea8d7])
47
+
48
+ ## [0.2.1] - 2020-05-11
49
+ ### Changed
50
+ - Upgraded to ActiveRecord-5.0.0 ([5c595ee], [3eb4ad7])
51
+
52
+ ## [0.1.6] - 2020-05-10
53
+ ### Changed
54
+ - Updated libraries ([eca4346]), ([7d3bb4f], [203c689], [171cf27])
55
+
56
+ ## [0.1.2] - 2018-11-29
57
+ ### Changed
58
+ - Updated rubyzip library ([38c4156])
59
+
60
+ ## [0.1.1] - 2017-01-10
61
+ ### Changed
62
+ - Removed code coverage from Rails generators ([a990357])
63
+
64
+ ## [0.1.0] - 2017-01-09
65
+ ### Added
66
+ - Initial commit ([47c0950], [5225f33])
67
+ - Fixed tests ([e68a6b7])
data/Gemfile CHANGED
@@ -3,9 +3,11 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in act_as_page_extractor.gemspec
4
4
  gemspec
5
5
 
6
- gem 'activerecord', '~> 6'
6
+ ruby '3.2.3'
7
7
 
8
- gem 'awesome_print'
8
+ gem 'activerecord'
9
+
10
+ gem 'amazing_print'
9
11
 
10
12
  gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
11
13
  gem 'pdf_utils' # getting text from pdf
data/Gemfile.lock CHANGED
@@ -1,9 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- act_as_page_extractor (0.6.1)
5
- activerecord (~> 6)
6
- awesome_print (~> 1)
4
+ act_as_page_extractor (0.7.0)
5
+ activerecord (>= 6)
6
+ amazing_print (~> 1)
7
7
  docsplit (~> 0)
8
8
  filesize (~> 0)
9
9
  pdf-reader (~> 1, >= 1.4)
@@ -27,8 +27,9 @@ GEM
27
27
  tzinfo (~> 2.0)
28
28
  zeitwerk (~> 2.3)
29
29
  afm (0.2.2)
30
+ amazing_print (1.8.1)
30
31
  awesome_print (1.9.2)
31
- byebug (11.1.3)
32
+ byebug (12.0.0)
32
33
  concurrent-ruby (1.3.4)
33
34
  diff-lcs (1.5.1)
34
35
  docile (1.4.1)
@@ -80,13 +81,13 @@ GEM
80
81
  zeitwerk (2.6.17)
81
82
 
82
83
  PLATFORMS
83
- ruby
84
+ x86_64-linux
84
85
 
85
86
  DEPENDENCIES
86
87
  act_as_page_extractor!
87
- activerecord (~> 6)
88
- awesome_print
89
- bundler (~> 1)
88
+ activerecord
89
+ amazing_print
90
+ bundler (>= 2.2.33)
90
91
  byebug
91
92
  docsplit
92
93
  filesize
@@ -98,5 +99,8 @@ DEPENDENCIES
98
99
  simplecov
99
100
  total_compressor
100
101
 
102
+ RUBY VERSION
103
+ ruby 3.2.3p157
104
+
101
105
  BUNDLED WITH
102
- 1.17.3
106
+ 2.4.1
data/README.md CHANGED
@@ -1,30 +1,29 @@
1
+ [![codecov](https://codecov.io/gh/phlowerteam/act_as_page_extractor/branch/41-Feature-Improve-error-procesing/graph/badge.svg)](https://codecov.io/gh/phlowerteam/act_as_page_extractor)
2
+
1
3
  act_as_page_extractor
2
4
  ================
3
5
 
4
- Library for extracting plain text from documents(files) for further processing (indexing and searching).
6
+ A library that extracts plain text from documents for subsequent processing, such as indexing and search.
5
7
 
6
8
  ## Installation
7
9
 
8
- Install appropriate tools before using:
9
-
10
+ Install all dependencies before use:
10
11
  ```sh
11
- sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
12
+ sh Aptfile.sh
12
13
  ```
13
- Add this line to your application's Gemfile:
14
14
 
15
+ Add this to your Gemfile:
15
16
  ```rb
16
17
  gem 'act_as_page_extractor'
17
- bundle
18
18
  ```
19
19
  ## Usage
20
20
 
21
- For example, for model Document in the Rails framework we need run:
22
-
21
+ Generate a migration, for example for a Document model:
23
22
  ```sh
24
23
  rails g act_as_page_extractor:migration Document category_id user_id
25
24
  ```
26
25
 
27
- As a result we get two migration files:
26
+ This will generate two migration files:
28
27
  ```rb
29
28
  class AddPageExtractorFields < ActiveRecord::Migration
30
29
  def change
@@ -32,6 +31,7 @@ class AddPageExtractorFields < ActiveRecord::Migration
32
31
  add_column :documents, :page_extraction_pages, :integer, default: 0
33
32
  add_column :documents, :page_extraction_doctype, :string, default: ''
34
33
  add_column :documents, :page_extraction_filesize, :string, default: ''
34
+ add_column :documents, :pages_extraction_errors, :text, default: ''
35
35
  end
36
36
  end
37
37
 
@@ -65,20 +65,20 @@ Add to model next parameters for initializing:
65
65
 
66
66
  act_as_page_extractor options: {
67
67
  document_class: 'Document',
68
- save_as_pdf: true,
68
+ save_as_pdf: true, # store converted document as PDF
69
69
  filename: :filename,
70
70
  document_id: :document_id,
71
- additional_fields: [:category_id, :user_id],
72
- #file_storage: "/full/path/to/documents/storage",
73
- #pdf_storage: "/full/path/to/extracted/pdf/storage"
71
+ additional_fields: [:category_id, :user_id], # copy values of these fields from document to extracted_page
72
+ root_folder: Rails.root.to_s, # or "/full/path/to/project"; lets the storage folders be shared between deployments
73
+ # file_storage: "/full/path/to/project/public/uploads/documents/storage" # optional
74
+ # pdf_storage: "/full/path/to/project/public/uploads/extracted/pdf/storage" # optional
74
75
  }
75
76
 
76
77
  has_many :extracted_pages, dependent: :destroy
77
78
  end
78
79
  ```
79
80
 
80
- Now our instance has few new methods:
81
-
81
+ The instance now provides several new methods:
82
82
  ```rb
83
83
  document = Document.first
84
84
  document.page_extract!
@@ -100,29 +100,24 @@ ActAsPageExtractor.statistics
100
100
 
101
101
  Parameters of initializing **act_as_page_extractor**:
102
102
 
103
- * **document_class** - name of model (e.g. Document)
104
- * **save_as_pdf** - boolean [true, false] when we want save temporary pdf
105
- * **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
106
- * **document_id** - name for saving id
107
- * **additional_fields** - additional fields that added to extracted page (e.g. for indexing, etc.)
108
- * **file_storage** - path for saving tmp files (by default it is "public")
109
- * **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
103
+ * **document_class** The name of the model (e.g., `Document`).
104
+ * **save_as_pdf** Boolean (`true`/`false`). Indicates whether to save a temporary PDF.
105
+ * **filename** The field containing access to the file. This should be an object with a `url` method that returns the file path (e.g., a CarrierWave object with `filename.url`).
106
+ * **document_id** The field name for storing the document ID.
107
+ * **additional_fields** Extra fields to be added to the extracted page (useful for indexing, etc.).
108
+ * **root_folder** The root folder to be shared across deployments (e.g., `Rails.root.to_s`).
109
+ * **file_storage** Path for saving temporary files (default: `"public"`).
110
+ * **pdf_storage** — Path for saving PDFs (default: `"public/uploads/extracted/pdf"`).
110
111
 
111
112
  ## Run tests
112
113
  ```sh
114
+ bundle
113
115
  rspec
114
116
  ```
115
- ## Contributing
116
- 1. Fork it
117
- 2. Create your feature branch (`git checkout -b my-new-feature`)
118
- 3. Commit your changes (`git commit -am 'Add some feature'`)
119
- 4. Push to the branch (`git push origin my-new-feature`)
120
- 5. Create new Pull Request
121
117
 
122
118
  ## Contacts
123
- https://github.com/phlowerteam
124
- phlowerteam@gmail.com
119
+ https://github.com/phlowerteam / phlowerteam[A]gmail.com
125
120
 
126
121
  ## License
127
- Copyright (c) 2024 PhlowerTeam
128
- MIT License
122
+
123
+ MIT License © 2025 PhlowerTeam
@@ -16,15 +16,16 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ['lib']
19
+ spec.required_ruby_version = '>= 3.2'
19
20
 
20
- spec.add_development_dependency 'bundler', '~> 1'
21
+ spec.add_development_dependency 'bundler', '>= 2.2.33'
21
22
  spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
22
23
  spec.add_development_dependency 'byebug', '~> 0'
23
24
  spec.add_development_dependency 'rspec', '~> 0'
24
25
  spec.add_development_dependency 'simplecov', '~> 0'
25
26
 
26
- spec.add_runtime_dependency 'activerecord', '~> 6'
27
- spec.add_runtime_dependency 'awesome_print', '~> 1'
27
+ spec.add_runtime_dependency 'activerecord', '>= 6'
28
+ spec.add_runtime_dependency 'amazing_print', '~> 1'
28
29
  spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
29
30
  spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
30
31
  spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
@@ -1,3 +1,10 @@
1
+ # Fix: https://github.com/documentcloud/docsplit/pull/159
2
+ class File
3
+ class << self
4
+ alias_method :exists?, :exist?
5
+ end
6
+ end
7
+
1
8
  module ActAsPageExtractor
2
9
  def extract_pages
3
10
  convert_to_pdf
@@ -10,26 +17,25 @@ module ActAsPageExtractor
10
17
  else
11
18
  if timeout_wrapper{ Docsplit.extract_pdf(@document_path, output: @tmp_dir)}
12
19
  pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
13
- pdf_path if File.exists?(pdf_path)
20
+ pdf_path if File.exist?(pdf_path)
14
21
  end
15
22
  end
23
+ rescue StandardError => e
24
+ add_error(e)
16
25
  end
17
26
 
18
27
  def convert_to_text
19
- begin
20
- @pdf_pages = PdfUtils.info(@pdf_path).pages
21
- if @pdf_pages
22
- if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
23
- else
24
- # :nocov:
25
- @pdf_pages = nil
26
- raise
27
- # :nocov:
28
- end
28
+ @pdf_pages = PdfUtils.info(@pdf_path).pages
29
+ if @pdf_pages
30
+ if timeout_wrapper{ Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
31
+ else
32
+ # :nocov:
33
+ @pdf_pages = nil
34
+ raise ERRORS[:unknown_docsplit_error]
35
+ # :nocov:
29
36
  end
30
- # :nocov:
31
- rescue
32
37
  end
33
- # :nocov:
38
+ rescue StandardError => e
39
+ add_error(e)
34
40
  end
35
41
  end
@@ -10,7 +10,7 @@ module ActAsPageExtractor
10
10
  end
11
11
 
12
12
  def remove_files
13
- FileUtils::rm_rf(pdf_path) if File.exists?(pdf_path.to_s)
13
+ FileUtils::rm_rf(pdf_path) if File.exist?(pdf_path.to_s)
14
14
  end
15
15
 
16
16
  def self.start_extraction
@@ -5,8 +5,9 @@ module ActAsPageExtractor
5
5
  def timeout_wrapper
6
6
  result = nil
7
7
  begin
8
- result = Timeout::timeout(60*5) { yield }
9
- rescue
8
+ result = Timeout::timeout(EXTRACTION_TIMEOUT) { yield }
9
+ rescue StandardError => e
10
+ add_error(e)
10
11
  ensure
11
12
  result
12
13
  end
@@ -25,12 +26,13 @@ module ActAsPageExtractor
25
26
  }
26
27
  else
27
28
  {
28
- page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
29
+ page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
29
30
  page_extraction_pages: 0
30
31
  }
31
32
  end.merge({
32
33
  page_extraction_doctype: @document_path&.split('.')&.last,
33
- page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
34
+ page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty,
35
+ pages_extraction_errors: @pages_extraction_errors.chomp
34
36
  })
35
37
  self.update(updated_attributes)
36
38
  end
@@ -39,6 +41,14 @@ module ActAsPageExtractor
39
41
  self.extracted_pages.destroy_all
40
42
  end
41
43
 
44
+ def add_error(e)
45
+ if ERRORS.values.include?(e.message)
46
+ @pages_extraction_errors << "#{e.message}\n\n"
47
+ else
48
+ @pages_extraction_errors << "#{e.class}, #{e.message}\n#{e.backtrace[0..ERROR_BACKTRACE_LINES].join("\n")}\n"
49
+ end
50
+ end
51
+
42
52
  # :nocov:
43
53
  def debug_info
44
54
  # ap "@tmp_dir"
@@ -1,6 +1,9 @@
1
1
  module ActAsPageExtractor
2
2
  def unzip_document
3
3
  @document_path = @copy_document_path
4
+
5
+ return if VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
6
+
4
7
  if validate_compress_types
5
8
  result = TotalCompressor.decompress(@copy_document_path)
6
9
  if result[:success] && result[:files].length == 1
@@ -12,4 +15,15 @@ module ActAsPageExtractor
12
15
  end
13
16
  end
14
17
  end
18
+
19
+ def validate_compress_types
20
+ valid = VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
21
+
22
+ unless valid
23
+ @page_extraction_state = EXTRACTING_STATES[:error_doctype]
24
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
25
+ end
26
+
27
+ valid
28
+ end
15
29
  end
@@ -1,22 +1,28 @@
1
1
  module ActAsPageExtractor
2
- VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
3
- VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
4
- 'rtf', 'odt', 'htm', 'html'].freeze
5
-
6
2
  def valid_document
7
3
  validate_size && validate_doc_types
8
4
  end
9
5
 
10
6
  def validate_size
11
7
  mb = 2**20
12
- File.size(@copy_document_path) <= 1*mb
13
- end
8
+ valid = File.size(@copy_document_path) <= 1*mb
9
+
10
+ unless valid
11
+ @page_extraction_state = EXTRACTING_STATES[:error_filesize]
12
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
13
+ end
14
14
 
15
- def validate_compress_types
16
- VALIDATE_COMPRESS_TYPES.include?(@copy_document_path.split('.').last&.downcase)
15
+ valid
17
16
  end
18
17
 
19
18
  def validate_doc_types
20
- VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
19
+ valid = VALIDATE_DOC_TYPES.include?(@document_path.split('.').last&.downcase)
20
+
21
+ unless valid
22
+ @page_extraction_state = EXTRACTING_STATES[:error_doctype]
23
+ @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
24
+ end
25
+
26
+ valid
21
27
  end
22
28
  end
@@ -1,5 +1,5 @@
1
1
  # :nocov:
2
2
  module ActAsPageExtractor
3
- VERSION = "0.6.3"
3
+ VERSION = "0.7.0"
4
4
  end
5
5
  # :nocov:
@@ -19,9 +19,26 @@ require 'act_as_page_extractor/modules/saving.rb'
19
19
  require 'act_as_page_extractor/modules/interface'
20
20
 
21
21
  module ActAsPageExtractor
22
-
23
22
  extend ActiveSupport::Concern
24
23
 
24
+ DEFAULT_ROOT_FOLDER = Dir.pwd.to_s
25
+ ERRORS = {
26
+ unknown_docsplit_error: 'Unknown Docsplit error'
27
+ }.freeze
28
+ ERROR_BACKTRACE_LINES = 15
29
+ EXTRACTING_STATES = {
30
+ new: 'new',
31
+ extracting: 'extracting',
32
+ extracted: 'extracted',
33
+ error_doctype: 'error_doctype',
34
+ error_extraction: 'error_extraction',
35
+ error_filesize: 'error_filesize'
36
+ }.freeze
37
+ EXTRACTION_TIMEOUT = 60*5 # 5 minutes
38
+ VALIDATE_COMPRESS_TYPES = ['zip', 'rar', '7z', 'gzip'].freeze
39
+ VALIDATE_DOC_TYPES = ['txt', 'pdf', 'doc', 'docx',
40
+ 'rtf', 'odt', 'htm', 'html'].freeze
41
+
25
42
  included do
26
43
  before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
27
44
  before_destroy :remove_files
@@ -35,23 +52,16 @@ module ActAsPageExtractor
35
52
  ActAsPageExtractor.define_singleton_method(:document_class) {|*args| Object.const_get(options[:document_class]) }
36
53
  define_method(:extracted_document_id){|*args| options[:document_id] }
37
54
  define_method(:additional_fields){|*args| options[:additional_fields] || [] }
38
- define_method(:file_storage){|*args| options[:file_storage] || FILE_STORAGE }
39
- define_method(:pdf_storage){|*args| options[:pdf_storage] || PDF_STORAGE }
55
+ define_method(:root_folder){|*args| options[:root_folder] || DEFAULT_ROOT_FOLDER }
56
+ define_method(:file_storage){|*args| options[:file_storage] || "#{root_folder}/public".freeze }
57
+ define_method(:pdf_storage){|*args| options[:pdf_storage] || "#{file_storage}/uploads/extracted/pdf".freeze }
58
+ define_method(:tmp_extraction_file_storage){|*args| "#{root_folder}/tmp/page_extraction" }
40
59
  end
41
60
  end
42
61
 
43
- EXTRACTING_STATES = {
44
- new: 'new',
45
- extracting: 'extracting',
46
- extracted: 'extracted',
47
- 'error.extraction': 'error.extraction'
48
- }.freeze
49
-
50
- TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
51
- FILE_STORAGE = "#{Dir.pwd}/public".freeze
52
- PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
53
-
54
62
  def initialized
63
+ @page_extraction_state = nil
64
+ @pages_extraction_errors = ''
55
65
  # add all need callbacks
56
66
  #on destroy remove pdf
57
67
 
@@ -85,13 +95,13 @@ module ActAsPageExtractor
85
95
 
86
96
  def create_pdf_dir
87
97
  if save_as_pdf
88
- FileUtils::mkdir_p(pdf_storage) unless File.exists?(pdf_storage)
98
+ FileUtils::mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
89
99
  end
90
100
  end
91
101
 
92
102
  def create_tmp_dir
93
- @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
94
- FileUtils::mkdir_p(@tmp_dir) unless File.exists?(@tmp_dir)
103
+ @tmp_dir = "#{tmp_extraction_file_storage}/#{SecureRandom.hex(6)}"
104
+ FileUtils::mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
95
105
  end
96
106
 
97
107
  def copy_document
@@ -4,5 +4,6 @@ class <%= migration_class_name_documents %> < ActiveRecord::Migration
4
4
  add_column :<%= documents_table_name %>, :page_extraction_pages, :integer, default: 0
5
5
  add_column :<%= documents_table_name %>, :page_extraction_doctype, :string, default: ''
6
6
  add_column :<%= documents_table_name %>, :page_extraction_filesize, :string, default: ''
7
+ add_column :<%= documents_table_name %>, :pages_extraction_errors, :text, default: ''
7
8
  end
8
9
  end