act_as_page_extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +57 -0
- data/.rmvrc +1 -0
- data/.rspec +3 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +107 -0
- data/LICENSE +21 -0
- data/README.md +119 -0
- data/Rakefile +6 -0
- data/act_as_page_extractor.gemspec +34 -0
- data/lib/act_as_page_extractor.rb +126 -0
- data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
- data/lib/act_as_page_extractor/modules/interface.rb +30 -0
- data/lib/act_as_page_extractor/modules/saving.rb +47 -0
- data/lib/act_as_page_extractor/modules/tools.rb +54 -0
- data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
- data/lib/act_as_page_extractor/modules/validating.rb +22 -0
- data/lib/act_as_page_extractor/version.rb +5 -0
- data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
- data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
- data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
- data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
- data/spec/act_as_page_extractor_spec.rb +46 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/models.rb +92 -0
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +279 -0
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +339 -0
- data/test/test-doc-3-pages.txt +125 -0
- data/test/test-doc-3-pages.wrong +0 -0
- metadata +279 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5b69a8043a6f3c7c01db160cbefaac0901a095e9
|
4
|
+
data.tar.gz: 7f18973005aa3aa1c7ea465bc73b7d4a1fcce266
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fb9b0bb9ae38c501bb7d0d729a893be689695d6eb5cec46ab629bd529da1eb70c6ce0463974cead6cc4d955da915d3c01d569597d4fe1d30ef6441efa4ff01ba
|
7
|
+
data.tar.gz: 1836aa3cad7d8ff323b55afd3433e8842bb03efd069f0d9d947debc54af7324faa737dc08c9d11851861dd493f17d523b771747e8884b2300b9c593c383dacfb
|
data/.gitignore
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
## Specific to RubyMotion:
|
17
|
+
.dat*
|
18
|
+
.repl_history
|
19
|
+
build/
|
20
|
+
*.bridgesupport
|
21
|
+
build-iPhoneOS/
|
22
|
+
build-iPhoneSimulator/
|
23
|
+
|
24
|
+
## Specific to RubyMotion (use of CocoaPods):
|
25
|
+
#
|
26
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
27
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
28
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
29
|
+
#
|
30
|
+
# vendor/Pods/
|
31
|
+
|
32
|
+
## Documentation cache and generated files:
|
33
|
+
/.yardoc/
|
34
|
+
/_yardoc/
|
35
|
+
/doc/
|
36
|
+
/rdoc/
|
37
|
+
|
38
|
+
## Environment normalization:
|
39
|
+
/.bundle/
|
40
|
+
/vendor/bundle
|
41
|
+
/lib/bundler/man/
|
42
|
+
|
43
|
+
# for a library or gem, you might want to ignore these files since the code is
|
44
|
+
# intended to run in multiple environments; otherwise, check them in:
|
45
|
+
# Gemfile.lock
|
46
|
+
# .ruby-version
|
47
|
+
# .ruby-gemset
|
48
|
+
|
49
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
50
|
+
.rvmrc
|
51
|
+
|
52
|
+
# Ignore all logfiles and tempfiles.
|
53
|
+
/log/*
|
54
|
+
!/log/.keep
|
55
|
+
/public/uploads/*
|
56
|
+
.idea/*
|
57
|
+
test/uploads
|
data/.rmvrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use ruby-2.3.0@act_as_page_extractor --create
|
data/.rspec
ADDED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
act_as_page_extractor
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.3.0
|
data/Gemfile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
source 'https://rubygems.org'

# The gem's own dependencies are declared in act_as_page_extractor.gemspec;
# `gemspec` tells Bundler to read them from there.
gemspec

# The entries below repeat the gemspec's runtime dependencies with the same
# version constraints, so they do not change dependency resolution.
gem 'activerecord', '~> 4.1'

gem 'awesome_print'

gem 'docsplit'          # API for OpenOffice jodconverter (any to pdf)
gem 'pdf_utils'         # getting text from pdf
gem 'prawn', '~>0.7.1'  # need for pdf_utils
gem 'pdf-reader'        # need for pdf_utils
gem 'total_compressor'  # decompressing files
gem 'filesize'          # pretty size of file

gem 'byebug'

group :test do
  gem 'rspec', '>= 2.14'
  # Already inside the :test group, so no explicit `group:` option is needed.
  # `require: false` keeps Bundler.require from loading simplecov automatically.
  gem 'simplecov', require: false
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
act_as_page_extractor (0.0.1)
|
5
|
+
activerecord (~> 4.1)
|
6
|
+
awesome_print
|
7
|
+
docsplit
|
8
|
+
filesize
|
9
|
+
pdf-reader
|
10
|
+
pdf_utils
|
11
|
+
prawn (~> 0.7.1)
|
12
|
+
total_compressor
|
13
|
+
|
14
|
+
GEM
|
15
|
+
remote: https://rubygems.org/
|
16
|
+
specs:
|
17
|
+
Ascii85 (1.0.2)
|
18
|
+
activemodel (4.2.7.1)
|
19
|
+
activesupport (= 4.2.7.1)
|
20
|
+
builder (~> 3.1)
|
21
|
+
activerecord (4.2.7.1)
|
22
|
+
activemodel (= 4.2.7.1)
|
23
|
+
activesupport (= 4.2.7.1)
|
24
|
+
arel (~> 6.0)
|
25
|
+
activesupport (4.2.7.1)
|
26
|
+
i18n (~> 0.7)
|
27
|
+
json (~> 1.7, >= 1.7.7)
|
28
|
+
minitest (~> 5.1)
|
29
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
30
|
+
tzinfo (~> 1.1)
|
31
|
+
afm (0.2.2)
|
32
|
+
arel (6.0.3)
|
33
|
+
awesome_print (1.7.0)
|
34
|
+
builder (3.2.2)
|
35
|
+
byebug (9.0.5)
|
36
|
+
diff-lcs (1.2.5)
|
37
|
+
docile (1.1.5)
|
38
|
+
docsplit (0.7.6)
|
39
|
+
filesize (0.1.1)
|
40
|
+
hashery (2.1.2)
|
41
|
+
i18n (0.7.0)
|
42
|
+
json (1.8.3)
|
43
|
+
minitest (5.9.0)
|
44
|
+
pdf-reader (1.4.0)
|
45
|
+
Ascii85 (~> 1.0.0)
|
46
|
+
afm (~> 0.2.1)
|
47
|
+
hashery (~> 2.0)
|
48
|
+
ruby-rc4
|
49
|
+
ttfunk
|
50
|
+
pdf_utils (0.1.0)
|
51
|
+
prawn (0.7.2)
|
52
|
+
prawn-core (>= 0.7.2, < 0.8)
|
53
|
+
prawn-layout (>= 0.7.2, < 0.8)
|
54
|
+
prawn-security (>= 0.7.1, < 0.8)
|
55
|
+
prawn-core (0.7.2)
|
56
|
+
prawn-layout (0.7.2)
|
57
|
+
prawn-security (0.7.1)
|
58
|
+
rake (11.2.2)
|
59
|
+
rspec (3.5.0)
|
60
|
+
rspec-core (~> 3.5.0)
|
61
|
+
rspec-expectations (~> 3.5.0)
|
62
|
+
rspec-mocks (~> 3.5.0)
|
63
|
+
rspec-core (3.5.3)
|
64
|
+
rspec-support (~> 3.5.0)
|
65
|
+
rspec-expectations (3.5.0)
|
66
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
67
|
+
rspec-support (~> 3.5.0)
|
68
|
+
rspec-mocks (3.5.0)
|
69
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
70
|
+
rspec-support (~> 3.5.0)
|
71
|
+
rspec-support (3.5.0)
|
72
|
+
ruby-rc4 (0.1.5)
|
73
|
+
rubyzip (0.9.9)
|
74
|
+
simplecov (0.11.2)
|
75
|
+
docile (~> 1.1.0)
|
76
|
+
json (~> 1.8)
|
77
|
+
simplecov-html (~> 0.10.0)
|
78
|
+
simplecov-html (0.10.0)
|
79
|
+
thread_safe (0.3.5)
|
80
|
+
total_compressor (0.1.6)
|
81
|
+
awesome_print
|
82
|
+
rubyzip (~> 0.9.9)
|
83
|
+
ttfunk (1.4.0)
|
84
|
+
tzinfo (1.2.2)
|
85
|
+
thread_safe (~> 0.1)
|
86
|
+
|
87
|
+
PLATFORMS
|
88
|
+
ruby
|
89
|
+
|
90
|
+
DEPENDENCIES
|
91
|
+
act_as_page_extractor!
|
92
|
+
activerecord (~> 4.1)
|
93
|
+
awesome_print
|
94
|
+
bundler (~> 1.3)
|
95
|
+
byebug
|
96
|
+
docsplit
|
97
|
+
filesize
|
98
|
+
pdf-reader
|
99
|
+
pdf_utils
|
100
|
+
prawn (~> 0.7.1)
|
101
|
+
rake
|
102
|
+
rspec (>= 2.14)
|
103
|
+
simplecov
|
104
|
+
total_compressor
|
105
|
+
|
106
|
+
BUNDLED WITH
|
107
|
+
1.13.7
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 Flower Team
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
act_as_page_extractor
|
2
|
+
================
|
3
|
+
|
4
|
+
Library for extracting plain text from documents (files) for further processing (indexing and searching).
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Install appropriate tools before using:
|
9
|
+
|
10
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
gem 'act_as_page_extractor'
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
For example, for a `Document` model we need to execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
$ rails g act_as_page_extractor:migration Document category_id user_id
|
22
|
+
|
23
|
+
As a result we get two migration files:
|
24
|
+
|
25
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
26
|
+
def change
|
27
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
28
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
29
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
30
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
35
|
+
def change
|
36
|
+
create_table :extracted_pages do |t|
|
37
|
+
t.text :page
|
38
|
+
t.integer :document_id
|
39
|
+
t.integer :category_id
|
40
|
+
t.integer :user_id
|
41
|
+
t.integer :page_number
|
42
|
+
|
43
|
+
t.timestamps null: false
|
44
|
+
end
|
45
|
+
|
46
|
+
add_index :extracted_pages, :document_id
|
47
|
+
add_index :extracted_pages, :category_id
|
48
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
The `Document` model must have a field that contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) containing [txt, pdf, doc/x, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html)).
|
55
|
+
|
56
|
+
Add the following initialization parameters to the model:
|
57
|
+
|
58
|
+
class Document < ActiveRecord::Base
|
59
|
+
include ActAsPageExtractor
|
60
|
+
|
61
|
+
act_as_page_extractor options: {
|
62
|
+
document_class: 'Document',
|
63
|
+
save_as_pdf: true,
|
64
|
+
filename: :filename,
|
65
|
+
document_id: :document_id,
|
66
|
+
additional_fields: [:category_id, :user_id],
|
67
|
+
#file_storage: "/full/path/to/documents/storage",
|
68
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
69
|
+
}
|
70
|
+
|
71
|
+
has_many :extracted_pages, dependent: :destroy
|
72
|
+
end
|
73
|
+
|
74
|
+
Now our instance has a few new methods:
|
75
|
+
|
76
|
+
document = Document.first
|
77
|
+
document.page_extract!
|
78
|
+
document.extracted_pages
|
79
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
|
+
|
81
|
+
# Access to pages
|
82
|
+
ExtractedPage.count
|
83
|
+
|
84
|
+
# Importing whole directory of documents
|
85
|
+
ActAsPageExtractor.import_files('/path/to/folder/with/documents')
|
86
|
+
|
87
|
+
# We can use cron to run the processing of all new documents
|
88
|
+
ActAsPageExtractor.start_extraction
|
89
|
+
|
90
|
+
# Getting statistics information of all documents
|
91
|
+
ActAsPageExtractor.statistics
|
92
|
+
|
93
|
+
Parameters of initializing `act_as_page_extractor options: { ... }`:
|
94
|
+
|
95
|
+
`document_class` - name of the model (e.g. 'Document')
|
96
|
+
`save_as_pdf` - boolean [true, false]; set to true when we want to save the temporary pdf
|
97
|
+
`filename` - name of the field that gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. a CarrierWave object with 'filename.url')
|
98
|
+
`document_id` - name for saving id
|
99
|
+
`additional_fields` - additional fields that are added to each extracted page (e.g. for indexing, etc.)
|
100
|
+
`file_storage` - path for saving tmp files (by default it is "public")
|
101
|
+
`pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
|
+
|
103
|
+
## Run tests
|
104
|
+
$ COVERAGE=true rspec
|
105
|
+
|
106
|
+
## Contributing
|
107
|
+
1. Fork it
|
108
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
109
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
110
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
111
|
+
5. Create new Pull Request
|
112
|
+
|
113
|
+
## Contacts
|
114
|
+
https://github.com/phlowerteam
|
115
|
+
phlowerteam@gmail.com
|
116
|
+
|
117
|
+
## License
|
118
|
+
Copyright (c) 2017 PhlowerTeam
|
119
|
+
MIT License
|
data/Rakefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for act_as_page_extractor.

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'act_as_page_extractor/version'

Gem::Specification.new do |spec|
  # --- Identity and metadata ---
  spec.name        = 'act_as_page_extractor'
  spec.version     = ActAsPageExtractor::VERSION
  spec.authors     = ['PhlowerTeam']
  spec.email       = ['phlowerteam@gmail.com']
  spec.description = 'Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice'
  spec.summary     = 'Uses system calls'
  spec.homepage    = 'https://github.com/phlowerteam'
  spec.license     = 'MIT'

  # --- Packaged files: everything tracked by git ---
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.select { |path| path =~ %r{^(test|spec|features)/} }
  spec.require_paths = ['lib']

  # --- Development dependencies ---
  spec.add_development_dependency 'bundler', '~> 1.3'
  %w[rake byebug rspec simplecov].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end

  # --- Runtime dependencies ---
  spec.add_runtime_dependency 'activerecord', '~> 4.1'
  spec.add_runtime_dependency 'awesome_print'
  spec.add_runtime_dependency 'docsplit'          # API for OpenOffice jodconverter (any to pdf)
  spec.add_runtime_dependency 'pdf_utils'         # getting text from pdf
  spec.add_runtime_dependency 'prawn', '~>0.7.1'  # need for pdf_utils
  spec.add_runtime_dependency 'pdf-reader'        # need for pdf_utils
  spec.add_runtime_dependency 'total_compressor'  # decompressing files
  spec.add_runtime_dependency 'filesize'          # pretty size of file
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'act_as_page_extractor/version'
|
2
|
+
|
3
|
+
require 'active_record'
|
4
|
+
|
5
|
+
require 'awesome_print'
|
6
|
+
require 'filesize'
|
7
|
+
require 'total_compressor'
|
8
|
+
require 'docsplit'
|
9
|
+
require 'pdf_utils'
|
10
|
+
require 'prawn'
|
11
|
+
require 'pdf-reader'
|
12
|
+
|
13
|
+
require 'act_as_page_extractor/modules/tools.rb'
|
14
|
+
require 'act_as_page_extractor/modules/validating.rb'
|
15
|
+
require 'act_as_page_extractor/modules/unzipping.rb'
|
16
|
+
require 'act_as_page_extractor/modules/extracting.rb'
|
17
|
+
require 'act_as_page_extractor/modules/saving.rb'
|
18
|
+
|
19
|
+
require 'act_as_page_extractor/modules/interface'
|
20
|
+
|
21
|
+
# Mixin that lets an ActiveRecord document model run a page-extraction
# pipeline over the file it points to.
#
# Including the module registers two callbacks on the host model and exposes
# the `act_as_page_extractor` class macro; `page_extract!` runs the pipeline
# for one record.
#
# NOTE(review): several steps used below (`cleanup_pages`, `unzip_document`,
# `valid_document`, `extract_pages`, `save_to_db`, `update_state`, `save_pdf`,
# `debug_info`, `remove_files`) are not defined in this file -- presumably they
# come from the files under act_as_page_extractor/modules; confirm there.
module ActAsPageExtractor
  extend ActiveSupport::Concern

  included do
    # Every new record starts in the 'new' extraction state.
    before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
    before_destroy :remove_files
  end

  module ClassMethods
    # Configures page extraction for the including model by defining reader
    # methods that return the given options.
    #
    # Recognised option keys (all read lazily, when the reader is called):
    #   :document_class    - name of the model class, resolved via Object.const_get
    #   :save_as_pdf       - returned as-is by #save_as_pdf
    #   :filename          - name of the method on the record that yields the file object
    #   :document_id       - returned as-is by #extracted_document_id
    #   :additional_fields - defaults to [] when absent
    #   :file_storage      - defaults to FILE_STORAGE when absent
    #   :pdf_storage       - defaults to PDF_STORAGE when absent
    #
    # Besides the instance readers, two singleton methods are (re)defined on
    # ActAsPageExtractor itself, so the most recent call to this macro
    # determines what `ActAsPageExtractor.extracted_filename` and
    # `ActAsPageExtractor.document_class` return.
    def act_as_page_extractor(options: {})
      define_method(:save_as_pdf) { |*_args| options[:save_as_pdf] }
      define_method(:extracted_filename) { |*_args| self.send(options[:filename].to_sym) }
      ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
      ActAsPageExtractor.define_singleton_method(:document_class) { |*_args| Object.const_get(options[:document_class]) }
      define_method(:extracted_document_id) { |*_args| options[:document_id] }
      define_method(:additional_fields) { |*_args| options[:additional_fields] || [] }
      define_method(:file_storage) { |*_args| options[:file_storage] || FILE_STORAGE }
      define_method(:pdf_storage) { |*_args| options[:pdf_storage] || PDF_STORAGE }
    end
  end

  # Values stored in the model's `page_extraction_state` column.
  EXTRACTING_STATES = {
    new: 'new',
    extracting: 'extracting',
    extracted: 'extracted',
    'error.extraction': 'error.extraction'
  }.freeze

  # Default locations, all relative to the process working directory at load time.
  TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
  FILE_STORAGE = "#{Dir.pwd}/public".freeze
  PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze

  # Prepares the record for extraction; currently this only makes sure the
  # PDF output directory exists (when PDFs are to be kept).
  #
  # TODO (carried over from the original notes): register any further
  # callbacks that are needed, e.g. removing the PDF on destroy.
  def initialized
    create_pdf_dir
  end

  # Runs the extraction pipeline for this record.
  #
  # The document is copied into a fresh temporary directory and unzipped; if
  # `valid_document` is truthy, pages are extracted and saved. Whatever
  # happens in those steps, `update_state`, `save_pdf` and `debug_info` are
  # run afterwards, and the temporary directory is always removed.
  #
  # Returns the value of the main body (the result of `save_to_db`, or nil
  # when the document is not valid). Exceptions raised by any step propagate
  # to the caller after cleanup.
  def page_extract!
    initialized
    cleanup_pages
    create_tmp_dir
    begin
      copy_document
      unzip_document
      if valid_document
        extract_pages
        save_to_db
      end
    ensure
      begin
        update_state
        save_pdf
        debug_info
      ensure
        # Nested so the temporary directory is removed even if one of the
        # bookkeeping steps above raises.
        finish
      end
    end
  end

  private

  # Creates the PDF storage directory when `save_as_pdf` is enabled.
  def create_pdf_dir
    return unless save_as_pdf

    # File.exist? rather than File.exists?: the latter is deprecated and no
    # longer available on Ruby 3.2+.
    FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
  end

  # Creates a uniquely named working directory under
  # TMP_EXTRACTION_FILE_STORAGE and remembers it in @tmp_dir.
  def create_tmp_dir
    @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
    FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
  end

  # Copies the original document into the working directory and records the
  # source path, the copy's path and the bare file name.
  #
  # The source path is `file_storage` followed by the `url` of the object
  # returned by `extracted_filename`.
  def copy_document
    @origin_document_path = "#{file_storage}#{self.send(:extracted_filename).url.to_s}"
    FileUtils.cp(@origin_document_path, @tmp_dir)
    @document_filename = File.basename(@origin_document_path)
    @copy_document_path = "#{@tmp_dir}/#{@document_filename}"
  end

  # Final step of the pipeline: discards the working directory.
  def finish
    remove_tmp_dir
  end

  # Removes the working directory. As a safety net, nothing is deleted
  # unless the path contains a "/tmp/" segment.
  def remove_tmp_dir
    FileUtils.rm_rf(@tmp_dir) if @tmp_dir && @tmp_dir.include?('/tmp/')
  end
end
|
112
|
+
|
113
|
+
# rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
|
114
|
+
|
115
|
+
# Rails 4 way
|
116
|
+
# 9.2.7.1 Multiple Callback Methods in One Class
|
117
|
+
# 258 page
|
118
|
+
|
119
|
+
# class ActiveRecord::Base
|
120
|
+
# def self.acts_as_page_extractor(document_field=:filename)
|
121
|
+
# auditor = Auditor.new(audit_log)
|
122
|
+
# after_create auditor
|
123
|
+
# after_update auditor
|
124
|
+
# after_destroy auditor
|
125
|
+
# end
|
126
|
+
# end
|