act_as_page_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5b69a8043a6f3c7c01db160cbefaac0901a095e9
4
+ data.tar.gz: 7f18973005aa3aa1c7ea465bc73b7d4a1fcce266
5
+ SHA512:
6
+ metadata.gz: fb9b0bb9ae38c501bb7d0d729a893be689695d6eb5cec46ab629bd529da1eb70c6ce0463974cead6cc4d955da915d3c01d569597d4fe1d30ef6441efa4ff01ba
7
+ data.tar.gz: 1836aa3cad7d8ff323b55afd3433e8842bb03efd069f0d9d947debc54af7324faa737dc08c9d11851861dd493f17d523b771747e8884b2300b9c593c383dacfb
data/.gitignore ADDED
@@ -0,0 +1,57 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
51
+
52
+ # Ignore all logfiles and tempfiles.
53
+ /log/*
54
+ !/log/.keep
55
+ /public/uploads/*
56
+ .idea/*
57
+ test/uploads
data/.rmvrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-2.3.0@act_as_page_extractor --create
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ #--warnings
3
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ act_as_page_extractor
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ ruby-2.3.0
data/Gemfile ADDED
@@ -0,0 +1,22 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in act_as_page_extractor.gemspec
4
+ gemspec
5
+
6
+ gem 'activerecord', '~> 4.1'
7
+
8
+ gem 'awesome_print'
9
+
10
+ gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
11
+ gem 'pdf_utils' # getting text from pdf
12
+ gem 'prawn', '~>0.7.1' # need for pdf_utils
13
+ gem 'pdf-reader' # need for pdf_utils
14
+ gem 'total_compressor' # decompressing files
15
+ gem 'filesize' # pretty size of file
16
+
17
+ gem 'byebug'
18
+
19
+ group :test do
20
+ gem 'rspec' , '>= 2.14'
21
+ gem 'simplecov', require: false, group: :test
22
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,107 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ act_as_page_extractor (0.0.1)
5
+ activerecord (~> 4.1)
6
+ awesome_print
7
+ docsplit
8
+ filesize
9
+ pdf-reader
10
+ pdf_utils
11
+ prawn (~> 0.7.1)
12
+ total_compressor
13
+
14
+ GEM
15
+ remote: https://rubygems.org/
16
+ specs:
17
+ Ascii85 (1.0.2)
18
+ activemodel (4.2.7.1)
19
+ activesupport (= 4.2.7.1)
20
+ builder (~> 3.1)
21
+ activerecord (4.2.7.1)
22
+ activemodel (= 4.2.7.1)
23
+ activesupport (= 4.2.7.1)
24
+ arel (~> 6.0)
25
+ activesupport (4.2.7.1)
26
+ i18n (~> 0.7)
27
+ json (~> 1.7, >= 1.7.7)
28
+ minitest (~> 5.1)
29
+ thread_safe (~> 0.3, >= 0.3.4)
30
+ tzinfo (~> 1.1)
31
+ afm (0.2.2)
32
+ arel (6.0.3)
33
+ awesome_print (1.7.0)
34
+ builder (3.2.2)
35
+ byebug (9.0.5)
36
+ diff-lcs (1.2.5)
37
+ docile (1.1.5)
38
+ docsplit (0.7.6)
39
+ filesize (0.1.1)
40
+ hashery (2.1.2)
41
+ i18n (0.7.0)
42
+ json (1.8.3)
43
+ minitest (5.9.0)
44
+ pdf-reader (1.4.0)
45
+ Ascii85 (~> 1.0.0)
46
+ afm (~> 0.2.1)
47
+ hashery (~> 2.0)
48
+ ruby-rc4
49
+ ttfunk
50
+ pdf_utils (0.1.0)
51
+ prawn (0.7.2)
52
+ prawn-core (>= 0.7.2, < 0.8)
53
+ prawn-layout (>= 0.7.2, < 0.8)
54
+ prawn-security (>= 0.7.1, < 0.8)
55
+ prawn-core (0.7.2)
56
+ prawn-layout (0.7.2)
57
+ prawn-security (0.7.1)
58
+ rake (11.2.2)
59
+ rspec (3.5.0)
60
+ rspec-core (~> 3.5.0)
61
+ rspec-expectations (~> 3.5.0)
62
+ rspec-mocks (~> 3.5.0)
63
+ rspec-core (3.5.3)
64
+ rspec-support (~> 3.5.0)
65
+ rspec-expectations (3.5.0)
66
+ diff-lcs (>= 1.2.0, < 2.0)
67
+ rspec-support (~> 3.5.0)
68
+ rspec-mocks (3.5.0)
69
+ diff-lcs (>= 1.2.0, < 2.0)
70
+ rspec-support (~> 3.5.0)
71
+ rspec-support (3.5.0)
72
+ ruby-rc4 (0.1.5)
73
+ rubyzip (0.9.9)
74
+ simplecov (0.11.2)
75
+ docile (~> 1.1.0)
76
+ json (~> 1.8)
77
+ simplecov-html (~> 0.10.0)
78
+ simplecov-html (0.10.0)
79
+ thread_safe (0.3.5)
80
+ total_compressor (0.1.6)
81
+ awesome_print
82
+ rubyzip (~> 0.9.9)
83
+ ttfunk (1.4.0)
84
+ tzinfo (1.2.2)
85
+ thread_safe (~> 0.1)
86
+
87
+ PLATFORMS
88
+ ruby
89
+
90
+ DEPENDENCIES
91
+ act_as_page_extractor!
92
+ activerecord (~> 4.1)
93
+ awesome_print
94
+ bundler (~> 1.3)
95
+ byebug
96
+ docsplit
97
+ filesize
98
+ pdf-reader
99
+ pdf_utils
100
+ prawn (~> 0.7.1)
101
+ rake
102
+ rspec (>= 2.14)
103
+ simplecov
104
+ total_compressor
105
+
106
+ BUNDLED WITH
107
+ 1.13.7
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 Flower Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,119 @@
1
+ act_as_page_extractor
2
+ ================
3
+
4
+ Library for extracting plain text from documents (files) for further processing (indexing and searching).
5
+
6
+ ## Installation
7
+
8
+ Install appropriate tools before using:
9
+
10
+ sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ gem 'act_as_page_extractor'
15
+
16
+ ## Usage
17
+
18
+ For example, for the Document model we need to execute:
19
+
20
+ $ bundle
21
+ $ rails g act_as_page_extractor:migration Document category_id user_id
22
+
23
+ As a result we get two migration files:
24
+
25
+ class AddPageExtractorFields < ActiveRecord::Migration
26
+ def change
27
+ add_column :documents, :page_extraction_state, :string, default: ''
28
+ add_column :documents, :page_extraction_pages, :integer, default: 0
29
+ add_column :documents, :page_extraction_doctype, :string, default: ''
30
+ add_column :documents, :page_extraction_filesize, :string, default: ''
31
+ end
32
+ end
33
+
34
+ class CreateExtractedPages < ActiveRecord::Migration
35
+ def change
36
+ create_table :extracted_pages do |t|
37
+ t.text :page
38
+ t.integer :document_id
39
+ t.integer :category_id
40
+ t.integer :user_id
41
+ t.integer :page_number
42
+
43
+ t.timestamps null: false
44
+ end
45
+
46
+ add_index :extracted_pages, :document_id
47
+ add_index :extracted_pages, :category_id
48
+ add_index :extracted_pages, [:document_id, :category_id]
49
+ add_index :extracted_pages, [:document_id, :page_number]
50
+ end
51
+ end
52
+
53
+
54
+ The Document model must have a field that contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
55
+
56
+ Add the following initialization parameters to the model:
57
+
58
+ class Document < ActiveRecord::Base
59
+ include ActAsPageExtractor
60
+
61
+ act_as_page_extractor options: {
62
+ document_class: 'Document',
63
+ save_as_pdf: true,
64
+ filename: :filename,
65
+ document_id: :document_id,
66
+ additional_fields: [:category_id, :user_id],
67
+ #file_storage: "/full/path/to/documents/storage",
68
+ #pdf_storage: "/full/path/to/extracted/pdf/storage"
69
+ }
70
+
71
+ has_many :extracted_pages, dependent: :destroy
72
+ end
73
+
74
+ Now our instance has a few new methods:
75
+
76
+ document = Document.first
77
+ document.page_extract!
78
+ document.extracted_pages
79
+ document.pdf_path # if option 'save_as_pdf' is 'true'
80
+
81
+ # Access to pages
82
+ ExtractedPage.count
83
+
84
+ # Importing whole directory of documents
85
+ ActAsPageExtractor.import_files('/path/to/folder/with/documents')
86
+
87
+ # We can use cron to run the processing of all the new documents
88
+ ActAsPageExtractor.start_extraction
89
+
90
+ # Getting statistics information of all documents
91
+ ActAsPageExtractor.statistics
92
+
93
+ Parameters of initializing `act_as_page_extractor options: { ... }`:
94
+
95
+ `document_class` - name of model (e.g. 'Document')
96
+ `save_as_pdf` - boolean [true, false]; set to true when we want to save the temporary pdf
97
+ `filename` - name of the field that gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. a CarrierWave object with 'filename.url')
98
+ `document_id` - name for saving id
99
+ `additional_fields` - additional fields that are added to each extracted page (e.g. for indexing, etc.)
100
+ `file_storage` - path for saving tmp files (by default it is "public")
101
+ `pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
102
+
103
+ ## Run tests
104
+ $ COVERAGE=true rspec
105
+
106
+ ## Contributing
107
+ 1. Fork it
108
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
109
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
110
+ 4. Push to the branch (`git push origin my-new-feature`)
111
+ 5. Create new Pull Request
112
+
113
+ ## Contacts
114
+ https://github.com/phlowerteam
115
+ phlowerteam@gmail.com
116
+
117
+ ## License
118
+ Copyright (c) 2017 PhlowerTeam
119
+ MIT License
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'act_as_page_extractor/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'act_as_page_extractor'
8
+ spec.version = ActAsPageExtractor::VERSION
9
+ spec.authors = ['PhlowerTeam']
10
+ spec.email = ['phlowerteam@gmail.com']
11
+ spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
12
+ spec.summary = %q{Uses system calls}
13
+ spec.homepage = 'https://github.com/phlowerteam'
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.add_development_dependency 'bundler', '~> 1.3'
21
+ spec.add_development_dependency 'rake'
22
+ spec.add_development_dependency 'byebug'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'simplecov'
25
+
26
+ spec.add_runtime_dependency 'activerecord', '~> 4.1'
27
+ spec.add_runtime_dependency 'awesome_print'
28
+ spec.add_runtime_dependency 'docsplit' # API for OpenOffice jodconverter (any to pdf)
29
+ spec.add_runtime_dependency 'pdf_utils' # getting text from pdf
30
+ spec.add_runtime_dependency 'prawn', '~>0.7.1' # need for pdf_utils
31
+ spec.add_runtime_dependency 'pdf-reader' # need for pdf_utils
32
+ spec.add_runtime_dependency 'total_compressor' # decompressing files
33
+ spec.add_runtime_dependency 'filesize' # pretty size of file
34
+ end
@@ -0,0 +1,126 @@
1
+ require 'act_as_page_extractor/version'
2
+
3
+ require 'active_record'
4
+
5
+ require 'awesome_print'
6
+ require 'filesize'
7
+ require 'total_compressor'
8
+ require 'docsplit'
9
+ require 'pdf_utils'
10
+ require 'prawn'
11
+ require 'pdf-reader'
12
+
13
+ require 'act_as_page_extractor/modules/tools.rb'
14
+ require 'act_as_page_extractor/modules/validating.rb'
15
+ require 'act_as_page_extractor/modules/unzipping.rb'
16
+ require 'act_as_page_extractor/modules/extracting.rb'
17
+ require 'act_as_page_extractor/modules/saving.rb'
18
+
19
+ require 'act_as_page_extractor/modules/interface'
20
+
21
+ module ActAsPageExtractor
22
+
23
+ extend ActiveSupport::Concern
24
+
25
+ included do
26
+ before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
27
+ before_destroy :remove_files
28
+ end
29
+
30
# Class-level macro that configures page extraction for a model.
module ClassMethods
  # Defines reader methods that expose the given options.
  #
  # Instance methods defined on the calling class:
  #   save_as_pdf           - options[:save_as_pdf]
  #   extracted_filename    - result of sending options[:filename] to the instance
  #   extracted_document_id - options[:document_id]
  #   additional_fields     - options[:additional_fields], or [] when absent
  #   file_storage          - options[:file_storage], or FILE_STORAGE when absent
  #   pdf_storage           - options[:pdf_storage], or PDF_STORAGE when absent
  #
  # Singleton methods defined on ActAsPageExtractor (redefined on every call,
  # so the most recent call wins):
  #   extracted_filename - options[:filename]
  #   document_class     - the class referenced by options[:document_class]
  #
  # Every generated method accepts and ignores any positional arguments.
  #
  # @param options [Hash] configuration; :document_class may be a constant
  #   name (String or Symbol) or the class/module object itself
  def act_as_page_extractor(options: {})
    define_method(:save_as_pdf) { |*_args| options[:save_as_pdf] }
    define_method(:extracted_filename) { |*_args| send(options[:filename].to_sym) }
    ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
    ActAsPageExtractor.define_singleton_method(:document_class) do |*_args|
      target = options[:document_class]
      # A class passed directly is returned as-is; names are resolved lazily
      # so the constant does not have to exist when the macro runs.
      target.is_a?(Module) ? target : Object.const_get(target)
    end
    define_method(:extracted_document_id) { |*_args| options[:document_id] }
    define_method(:additional_fields) { |*_args| options[:additional_fields] || [] }
    define_method(:file_storage) { |*_args| options[:file_storage] || FILE_STORAGE }
    define_method(:pdf_storage) { |*_args| options[:pdf_storage] || PDF_STORAGE }
  end
end
42
+
43
+ EXTRACTING_STATES = {
44
+ new: 'new',
45
+ extracting: 'extracting',
46
+ extracted: 'extracted',
47
+ 'error.extraction': 'error.extraction'
48
+ }.freeze
49
+
50
+ TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
51
+ FILE_STORAGE = "#{Dir.pwd}/public".freeze
52
+ PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze
53
+
54
# Preparation step: makes sure the pdf output directory is available
# (delegates to create_pdf_dir).
#
# Open notes carried over from the original author:
# - TODO: add all needed callbacks; remove the pdf on destroy.
# - TODO (README): document
#     rails g act_as_page_extractor:migration Document category_id user_id
#   and that the [Document] model needs
#     has_many :extracted_pages, dependent: :destroy
def initialized
  create_pdf_dir
end
64
+
65
# Runs the whole extraction pipeline for this record.
#
# Order of operations:
#   1. initialized, cleanup_pages, create_tmp_dir  - preparation
#   2. copy_document, unzip_document               - stage the file
#   3. extract_pages, save_to_db                   - only when valid_document is truthy
#   4. update_state, save_pdf, debug_info          - always run, even if step 2/3 raised
#   5. finish                                      - always runs, even if step 4 raised
#
# Any exception raised by a step still propagates to the caller after the
# cleanup steps have run.
#
# @return the value of the last step executed in the extraction body
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    begin
      update_state
      save_pdf
      debug_info
    ensure
      # Cleanup must not be skipped when one of the reporting steps fails,
      # otherwise the temporary working directory is left behind.
      finish
    end
  end
end
83
+
84
+ private
85
+
86
# Creates the pdf storage directory (including missing parents) when the
# save_as_pdf option is enabled; does nothing otherwise or when the path
# already exists.
def create_pdf_dir
  return unless save_as_pdf

  # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
  FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
end
91
+
92
# Picks a fresh working directory under TMP_EXTRACTION_FILE_STORAGE, named
# with 12 random hex characters, stores its path in @tmp_dir and creates it
# (including missing parents) if it is not there yet.
def create_tmp_dir
  @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
  # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
  FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
end
96
+
97
# Copies the source document into the temporary working directory.
#
# Sets:
#   @origin_document_path - file_storage joined with the url of extracted_filename
#   @copy_document_path   - location of the copy inside @tmp_dir
#   @document_filename    - bare file name of the document
#
# The path instance variables for the copy are only assigned after the copy
# succeeded; a failing FileUtils.cp raises before they are set.
def copy_document
  @origin_document_path = "#{file_storage}#{extracted_filename.url}"
  FileUtils.cp(@origin_document_path, @tmp_dir)
  # Derive the file name once instead of splitting the path twice.
  basename = File.basename(@origin_document_path)
  @copy_document_path = "#{@tmp_dir}/#{basename}"
  @document_filename = basename
end
103
+
104
# Last step of an extraction run; currently only delegates to
# remove_tmp_dir to discard the temporary working directory.
def finish
  remove_tmp_dir
end
107
+
108
# Recursively deletes the directory stored in @tmp_dir.
# Safety guard: nothing is removed unless the path contains a "/tmp/"
# segment (an unset @tmp_dir therefore results in a no-op).
def remove_tmp_dir
  return unless @tmp_dir =~ /\/tmp\//

  FileUtils.rm_rf(@tmp_dir)
end
111
+ end
112
+
113
+ # rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
114
+
115
+ # Rails 4 way
116
+ # 9.2.7.1 Multiple Callback Methods in One Class
117
+ # 258 page
118
+
119
+ # class ActiveRecord::Base
120
+ # def self.acts_as_page_extractor(document_field=:filename)
121
+ # auditor = Auditor.new(audit_log)
122
+ # after_create auditor
123
+ # after_update auditor
124
+ # after_destroy auditor
125
+ # end
126
+ # end