act_as_page_extractor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +57 -0
- data/.rmvrc +1 -0
- data/.rspec +3 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +107 -0
- data/LICENSE +21 -0
- data/README.md +119 -0
- data/Rakefile +6 -0
- data/act_as_page_extractor.gemspec +34 -0
- data/lib/act_as_page_extractor.rb +126 -0
- data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
- data/lib/act_as_page_extractor/modules/interface.rb +30 -0
- data/lib/act_as_page_extractor/modules/saving.rb +47 -0
- data/lib/act_as_page_extractor/modules/tools.rb +54 -0
- data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
- data/lib/act_as_page_extractor/modules/validating.rb +22 -0
- data/lib/act_as_page_extractor/version.rb +5 -0
- data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
- data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
- data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
- data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
- data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
- data/spec/act_as_page_extractor_spec.rb +46 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/models.rb +92 -0
- data/test/test-doc-3-pages.doc +0 -0
- data/test/test-doc-3-pages.docx +0 -0
- data/test/test-doc-3-pages.docx.7z +0 -0
- data/test/test-doc-3-pages.docx.rar +0 -0
- data/test/test-doc-3-pages.docx.zip +0 -0
- data/test/test-doc-3-pages.html +279 -0
- data/test/test-doc-3-pages.odt +0 -0
- data/test/test-doc-3-pages.pdf +0 -0
- data/test/test-doc-3-pages.rtf +339 -0
- data/test/test-doc-3-pages.txt +125 -0
- data/test/test-doc-3-pages.wrong +0 -0
- metadata +279 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5b69a8043a6f3c7c01db160cbefaac0901a095e9
|
4
|
+
data.tar.gz: 7f18973005aa3aa1c7ea465bc73b7d4a1fcce266
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fb9b0bb9ae38c501bb7d0d729a893be689695d6eb5cec46ab629bd529da1eb70c6ce0463974cead6cc4d955da915d3c01d569597d4fe1d30ef6441efa4ff01ba
|
7
|
+
data.tar.gz: 1836aa3cad7d8ff323b55afd3433e8842bb03efd069f0d9d947debc54af7324faa737dc08c9d11851861dd493f17d523b771747e8884b2300b9c593c383dacfb
|
data/.gitignore
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
## Specific to RubyMotion:
|
17
|
+
.dat*
|
18
|
+
.repl_history
|
19
|
+
build/
|
20
|
+
*.bridgesupport
|
21
|
+
build-iPhoneOS/
|
22
|
+
build-iPhoneSimulator/
|
23
|
+
|
24
|
+
## Specific to RubyMotion (use of CocoaPods):
|
25
|
+
#
|
26
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
27
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
28
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
29
|
+
#
|
30
|
+
# vendor/Pods/
|
31
|
+
|
32
|
+
## Documentation cache and generated files:
|
33
|
+
/.yardoc/
|
34
|
+
/_yardoc/
|
35
|
+
/doc/
|
36
|
+
/rdoc/
|
37
|
+
|
38
|
+
## Environment normalization:
|
39
|
+
/.bundle/
|
40
|
+
/vendor/bundle
|
41
|
+
/lib/bundler/man/
|
42
|
+
|
43
|
+
# for a library or gem, you might want to ignore these files since the code is
|
44
|
+
# intended to run in multiple environments; otherwise, check them in:
|
45
|
+
# Gemfile.lock
|
46
|
+
# .ruby-version
|
47
|
+
# .ruby-gemset
|
48
|
+
|
49
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
50
|
+
.rvmrc
|
51
|
+
|
52
|
+
# Ignore all logfiles and tempfiles.
|
53
|
+
/log/*
|
54
|
+
!/log/.keep
|
55
|
+
/public/uploads/*
|
56
|
+
.idea/*
|
57
|
+
test/uploads
|
data/.rmvrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use ruby-2.3.0@act_as_page_extractor --create
|
data/.rspec
ADDED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
act_as_page_extractor
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.3.0
|
data/Gemfile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in act_as_page_extractor.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'activerecord', '~> 4.1'
|
7
|
+
|
8
|
+
gem 'awesome_print'
|
9
|
+
|
10
|
+
gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
|
11
|
+
gem 'pdf_utils' # getting text from pdf
|
12
|
+
gem 'prawn', '~>0.7.1' # need for pdf_utils
|
13
|
+
gem 'pdf-reader' # need for pdf_utils
|
14
|
+
gem 'total_compressor' # decompressing files
|
15
|
+
gem 'filesize' # pretty size of file
|
16
|
+
|
17
|
+
gem 'byebug'
|
18
|
+
|
19
|
+
group :test do
|
20
|
+
gem 'rspec' , '>= 2.14'
|
21
|
+
gem 'simplecov', require: false, group: :test
|
22
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
act_as_page_extractor (0.0.1)
|
5
|
+
activerecord (~> 4.1)
|
6
|
+
awesome_print
|
7
|
+
docsplit
|
8
|
+
filesize
|
9
|
+
pdf-reader
|
10
|
+
pdf_utils
|
11
|
+
prawn (~> 0.7.1)
|
12
|
+
total_compressor
|
13
|
+
|
14
|
+
GEM
|
15
|
+
remote: https://rubygems.org/
|
16
|
+
specs:
|
17
|
+
Ascii85 (1.0.2)
|
18
|
+
activemodel (4.2.7.1)
|
19
|
+
activesupport (= 4.2.7.1)
|
20
|
+
builder (~> 3.1)
|
21
|
+
activerecord (4.2.7.1)
|
22
|
+
activemodel (= 4.2.7.1)
|
23
|
+
activesupport (= 4.2.7.1)
|
24
|
+
arel (~> 6.0)
|
25
|
+
activesupport (4.2.7.1)
|
26
|
+
i18n (~> 0.7)
|
27
|
+
json (~> 1.7, >= 1.7.7)
|
28
|
+
minitest (~> 5.1)
|
29
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
30
|
+
tzinfo (~> 1.1)
|
31
|
+
afm (0.2.2)
|
32
|
+
arel (6.0.3)
|
33
|
+
awesome_print (1.7.0)
|
34
|
+
builder (3.2.2)
|
35
|
+
byebug (9.0.5)
|
36
|
+
diff-lcs (1.2.5)
|
37
|
+
docile (1.1.5)
|
38
|
+
docsplit (0.7.6)
|
39
|
+
filesize (0.1.1)
|
40
|
+
hashery (2.1.2)
|
41
|
+
i18n (0.7.0)
|
42
|
+
json (1.8.3)
|
43
|
+
minitest (5.9.0)
|
44
|
+
pdf-reader (1.4.0)
|
45
|
+
Ascii85 (~> 1.0.0)
|
46
|
+
afm (~> 0.2.1)
|
47
|
+
hashery (~> 2.0)
|
48
|
+
ruby-rc4
|
49
|
+
ttfunk
|
50
|
+
pdf_utils (0.1.0)
|
51
|
+
prawn (0.7.2)
|
52
|
+
prawn-core (>= 0.7.2, < 0.8)
|
53
|
+
prawn-layout (>= 0.7.2, < 0.8)
|
54
|
+
prawn-security (>= 0.7.1, < 0.8)
|
55
|
+
prawn-core (0.7.2)
|
56
|
+
prawn-layout (0.7.2)
|
57
|
+
prawn-security (0.7.1)
|
58
|
+
rake (11.2.2)
|
59
|
+
rspec (3.5.0)
|
60
|
+
rspec-core (~> 3.5.0)
|
61
|
+
rspec-expectations (~> 3.5.0)
|
62
|
+
rspec-mocks (~> 3.5.0)
|
63
|
+
rspec-core (3.5.3)
|
64
|
+
rspec-support (~> 3.5.0)
|
65
|
+
rspec-expectations (3.5.0)
|
66
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
67
|
+
rspec-support (~> 3.5.0)
|
68
|
+
rspec-mocks (3.5.0)
|
69
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
70
|
+
rspec-support (~> 3.5.0)
|
71
|
+
rspec-support (3.5.0)
|
72
|
+
ruby-rc4 (0.1.5)
|
73
|
+
rubyzip (0.9.9)
|
74
|
+
simplecov (0.11.2)
|
75
|
+
docile (~> 1.1.0)
|
76
|
+
json (~> 1.8)
|
77
|
+
simplecov-html (~> 0.10.0)
|
78
|
+
simplecov-html (0.10.0)
|
79
|
+
thread_safe (0.3.5)
|
80
|
+
total_compressor (0.1.6)
|
81
|
+
awesome_print
|
82
|
+
rubyzip (~> 0.9.9)
|
83
|
+
ttfunk (1.4.0)
|
84
|
+
tzinfo (1.2.2)
|
85
|
+
thread_safe (~> 0.1)
|
86
|
+
|
87
|
+
PLATFORMS
|
88
|
+
ruby
|
89
|
+
|
90
|
+
DEPENDENCIES
|
91
|
+
act_as_page_extractor!
|
92
|
+
activerecord (~> 4.1)
|
93
|
+
awesome_print
|
94
|
+
bundler (~> 1.3)
|
95
|
+
byebug
|
96
|
+
docsplit
|
97
|
+
filesize
|
98
|
+
pdf-reader
|
99
|
+
pdf_utils
|
100
|
+
prawn (~> 0.7.1)
|
101
|
+
rake
|
102
|
+
rspec (>= 2.14)
|
103
|
+
simplecov
|
104
|
+
total_compressor
|
105
|
+
|
106
|
+
BUNDLED WITH
|
107
|
+
1.13.7
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 Flower Team
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
act_as_page_extractor
|
2
|
+
================
|
3
|
+
|
4
|
+
Library for extracting plain text from documents (files) for further processing (indexing and searching).
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Install appropriate tools before using:
|
9
|
+
|
10
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
gem 'act_as_page_extractor'
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
For example, for the Document model we need to run:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
$ rails g act_as_page_extractor:migration Document category_id user_id
|
22
|
+
|
23
|
+
As a result we get two migration files:
|
24
|
+
|
25
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
26
|
+
def change
|
27
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
28
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
29
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
30
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
35
|
+
def change
|
36
|
+
create_table :extracted_pages do |t|
|
37
|
+
t.text :page
|
38
|
+
t.integer :document_id
|
39
|
+
t.integer :category_id
|
40
|
+
t.integer :user_id
|
41
|
+
t.integer :page_number
|
42
|
+
|
43
|
+
t.timestamps null: false
|
44
|
+
end
|
45
|
+
|
46
|
+
add_index :extracted_pages, :document_id
|
47
|
+
add_index :extracted_pages, :category_id
|
48
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
The Document model must have a field that contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
|
55
|
+
|
56
|
+
Add the following initialization parameters to the model:
|
57
|
+
|
58
|
+
class Document < ActiveRecord::Base
|
59
|
+
include ActAsPageExtractor
|
60
|
+
|
61
|
+
act_as_page_extractor options: {
|
62
|
+
document_class: 'Document',
|
63
|
+
save_as_pdf: true,
|
64
|
+
filename: :filename,
|
65
|
+
document_id: :document_id,
|
66
|
+
additional_fields: [:category_id, :user_id],
|
67
|
+
#file_storage: "/full/path/to/documents/storage",
|
68
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
69
|
+
}
|
70
|
+
|
71
|
+
has_many :extracted_pages, dependent: :destroy
|
72
|
+
end
|
73
|
+
|
74
|
+
Now our instance has a few new methods:
|
75
|
+
|
76
|
+
document = Document.first
|
77
|
+
document.page_extract!
|
78
|
+
document.extracted_pages
|
79
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
|
+
|
81
|
+
# Access to pages
|
82
|
+
ExtractedPage.count
|
83
|
+
|
84
|
+
# Importing whole directory of documents
|
85
|
+
ActAsPageExtractor.import_files('/path/to/folder/with/documents')
|
86
|
+
|
87
|
+
# We can use cron to run the processing of all new documents
|
88
|
+
ActAsPageExtractor.start_extraction
|
89
|
+
|
90
|
+
# Getting statistics information of all documents
|
91
|
+
ActAsPageExtractor.statistics
|
92
|
+
|
93
|
+
Parameters of initializing `act_as_page_extractor options: { ... }`:
|
94
|
+
|
95
|
+
`document_class` - name of the model (e.g. 'Document')
|
96
|
+
`save_as_pdf` - boolean [true, false]; set to true when we want to save the temporary pdf
|
97
|
+
`filename` - name of the field that gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. a CarrierWave object with 'filename.url')
|
98
|
+
`document_id` - name for saving id
|
99
|
+
`additional_fields` - additional fields that are added to each extracted page (e.g. for indexing, etc.)
|
100
|
+
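`file_storage` - root path of the documents storage; it is prepended to the file's 'url' to locate the document (by default it is "public" under the current working directory)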
|
101
|
+
`pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
|
+
|
103
|
+
## Run tests
|
104
|
+
$ COVERAGE=true rspec
|
105
|
+
|
106
|
+
## Contributing
|
107
|
+
1. Fork it
|
108
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
109
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
110
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
111
|
+
5. Create new Pull Request
|
112
|
+
|
113
|
+
## Contacts
|
114
|
+
https://github.com/phlowerteam
|
115
|
+
phlowerteam@gmail.com
|
116
|
+
|
117
|
+
## License
|
118
|
+
Copyright (c) 2017 PhlowerTeam
|
119
|
+
MIT License
|
data/Rakefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'act_as_page_extractor/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'act_as_page_extractor'
|
8
|
+
spec.version = ActAsPageExtractor::VERSION
|
9
|
+
spec.authors = ['PhlowerTeam']
|
10
|
+
spec.email = ['phlowerteam@gmail.com']
|
11
|
+
spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
|
12
|
+
spec.summary = %q{Uses system calls}
|
13
|
+
spec.homepage = 'https://github.com/phlowerteam'
|
14
|
+
spec.license = 'MIT'
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
21
|
+
spec.add_development_dependency 'rake'
|
22
|
+
spec.add_development_dependency 'byebug'
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
|
+
spec.add_development_dependency 'simplecov'
|
25
|
+
|
26
|
+
spec.add_runtime_dependency 'activerecord', '~> 4.1'
|
27
|
+
spec.add_runtime_dependency 'awesome_print'
|
28
|
+
spec.add_runtime_dependency 'docsplit' # API for OpenOffice jodconverter (any to pdf)
|
29
|
+
spec.add_runtime_dependency 'pdf_utils' # getting text from pdf
|
30
|
+
spec.add_runtime_dependency 'prawn', '~>0.7.1' # need for pdf_utils
|
31
|
+
spec.add_runtime_dependency 'pdf-reader' # need for pdf_utils
|
32
|
+
spec.add_runtime_dependency 'total_compressor' # decompressing files
|
33
|
+
spec.add_runtime_dependency 'filesize' # pretty size of file
|
34
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'act_as_page_extractor/version'
|
2
|
+
|
3
|
+
require 'active_record'
|
4
|
+
|
5
|
+
require 'awesome_print'
|
6
|
+
require 'filesize'
|
7
|
+
require 'total_compressor'
|
8
|
+
require 'docsplit'
|
9
|
+
require 'pdf_utils'
|
10
|
+
require 'prawn'
|
11
|
+
require 'pdf-reader'
|
12
|
+
|
13
|
+
require 'act_as_page_extractor/modules/tools.rb'
|
14
|
+
require 'act_as_page_extractor/modules/validating.rb'
|
15
|
+
require 'act_as_page_extractor/modules/unzipping.rb'
|
16
|
+
require 'act_as_page_extractor/modules/extracting.rb'
|
17
|
+
require 'act_as_page_extractor/modules/saving.rb'
|
18
|
+
|
19
|
+
require 'act_as_page_extractor/modules/interface'
|
20
|
+
|
21
|
+
module ActAsPageExtractor
|
22
|
+
|
23
|
+
extend ActiveSupport::Concern
|
24
|
+
|
25
|
+
included do
|
26
|
+
before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
|
27
|
+
before_destroy :remove_files
|
28
|
+
end
|
29
|
+
|
30
|
+
module ClassMethods
|
31
|
+
def act_as_page_extractor(options: {})
|
32
|
+
define_method(:save_as_pdf){|*args| options[:save_as_pdf] }
|
33
|
+
define_method(:extracted_filename){|*args| self.send(options[:filename].to_sym) }
|
34
|
+
ActAsPageExtractor.define_singleton_method(:extracted_filename) {|*args| options[:filename] }
|
35
|
+
ActAsPageExtractor.define_singleton_method(:document_class) {|*args| Object.const_get(options[:document_class]) }
|
36
|
+
define_method(:extracted_document_id){|*args| options[:document_id] }
|
37
|
+
define_method(:additional_fields){|*args| options[:additional_fields] || [] }
|
38
|
+
define_method(:file_storage){|*args| options[:file_storage] || FILE_STORAGE }
|
39
|
+
define_method(:pdf_storage){|*args| options[:pdf_storage] || PDF_STORAGE }
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
EXTRACTING_STATES = {
|
44
|
+
new: 'new',
|
45
|
+
extracting: 'extracting',
|
46
|
+
extracted: 'extracted',
|
47
|
+
'error.extraction': 'error.extraction'
|
48
|
+
}.freeze
|
49
|
+
|
50
|
+
TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
|
51
|
+
FILE_STORAGE = "#{Dir.pwd}/public".freeze
|
52
|
+
PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
|
53
|
+
|
54
|
+
def initialized
|
55
|
+
# add all need callbacks
|
56
|
+
#on destroy remove pdf
|
57
|
+
|
58
|
+
#Add to Readme!!
|
59
|
+
#rails g act_as_page_extractor:migration Document category_id user_id
|
60
|
+
# add to [Document] model:
|
61
|
+
# has_many :extracted_pages, dependent: :destroy
|
62
|
+
create_pdf_dir
|
63
|
+
end
|
64
|
+
|
65
|
+
def page_extract!
|
66
|
+
initialized
|
67
|
+
cleanup_pages
|
68
|
+
create_tmp_dir
|
69
|
+
begin
|
70
|
+
copy_document
|
71
|
+
unzip_document
|
72
|
+
if valid_document
|
73
|
+
extract_pages
|
74
|
+
save_to_db
|
75
|
+
end
|
76
|
+
ensure
|
77
|
+
update_state
|
78
|
+
save_pdf
|
79
|
+
debug_info
|
80
|
+
finish
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
private
|
85
|
+
|
86
|
+
def create_pdf_dir
|
87
|
+
if save_as_pdf
|
88
|
+
FileUtils::mkdir_p(pdf_storage) unless File.exists?(pdf_storage)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def create_tmp_dir
|
93
|
+
@tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
|
94
|
+
FileUtils::mkdir_p(@tmp_dir) unless File.exists?(@tmp_dir)
|
95
|
+
end
|
96
|
+
|
97
|
+
def copy_document
|
98
|
+
@origin_document_path = "#{file_storage}#{self.send(:extracted_filename).url.to_s}"
|
99
|
+
FileUtils.cp(@origin_document_path, @tmp_dir)
|
100
|
+
@copy_document_path = "#{@tmp_dir}/#{@origin_document_path.split("/").last}"
|
101
|
+
@document_filename = @origin_document_path.split("/").last
|
102
|
+
end
|
103
|
+
|
104
|
+
def finish
|
105
|
+
remove_tmp_dir
|
106
|
+
end
|
107
|
+
|
108
|
+
def remove_tmp_dir
|
109
|
+
FileUtils.rm_rf(@tmp_dir) if @tmp_dir =~ /\/tmp\//
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
|
114
|
+
|
115
|
+
# Rails 4 way
|
116
|
+
# 9.2.7.1 Multiple Callback Methods in One Class
|
117
|
+
# 258 page
|
118
|
+
|
119
|
+
# class ActiveRecord::Base
|
120
|
+
# def self.acts_as_page_extractor(document_field=:filename)
|
121
|
+
# auditor = Auditor.new(audit_log)
|
122
|
+
# after_create auditor
|
123
|
+
# after_update auditor
|
124
|
+
# after_destroy auditor
|
125
|
+
# end
|
126
|
+
# end
|