act_as_page_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5b69a8043a6f3c7c01db160cbefaac0901a095e9
4
+ data.tar.gz: 7f18973005aa3aa1c7ea465bc73b7d4a1fcce266
5
+ SHA512:
6
+ metadata.gz: fb9b0bb9ae38c501bb7d0d729a893be689695d6eb5cec46ab629bd529da1eb70c6ce0463974cead6cc4d955da915d3c01d569597d4fe1d30ef6441efa4ff01ba
7
+ data.tar.gz: 1836aa3cad7d8ff323b55afd3433e8842bb03efd069f0d9d947debc54af7324faa737dc08c9d11851861dd493f17d523b771747e8884b2300b9c593c383dacfb
data/.gitignore ADDED
@@ -0,0 +1,57 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
51
+
52
+ # Ignore all logfiles and tempfiles.
53
+ /log/*
54
+ !/log/.keep
55
+ /public/uploads/*
56
+ .idea/*
57
+ test/uploads
data/.rmvrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-2.3.0@act_as_page_extractor --create
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ #--warnings
3
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ act_as_page_extractor
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ ruby-2.3.0
data/Gemfile ADDED
@@ -0,0 +1,22 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in act_as_page_extractor.gemspec
4
+ gemspec
5
+
6
+ gem 'activerecord', '~> 4.1'
7
+
8
+ gem 'awesome_print'
9
+
10
+ gem 'docsplit' # API for OpenOffice jodconverter (any to pdf)
11
+ gem 'pdf_utils' # getting text from pdf
12
+ gem 'prawn', '~>0.7.1' # need for pdf_utils
13
+ gem 'pdf-reader' # need for pdf_utils
14
+ gem 'total_compressor' # decompressing files
15
+ gem 'filesize' # pretty size of file
16
+
17
+ gem 'byebug'
18
+
19
+ group :test do
20
+ gem 'rspec' , '>= 2.14'
21
+ gem 'simplecov', require: false, group: :test
22
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,107 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ act_as_page_extractor (0.0.1)
5
+ activerecord (~> 4.1)
6
+ awesome_print
7
+ docsplit
8
+ filesize
9
+ pdf-reader
10
+ pdf_utils
11
+ prawn (~> 0.7.1)
12
+ total_compressor
13
+
14
+ GEM
15
+ remote: https://rubygems.org/
16
+ specs:
17
+ Ascii85 (1.0.2)
18
+ activemodel (4.2.7.1)
19
+ activesupport (= 4.2.7.1)
20
+ builder (~> 3.1)
21
+ activerecord (4.2.7.1)
22
+ activemodel (= 4.2.7.1)
23
+ activesupport (= 4.2.7.1)
24
+ arel (~> 6.0)
25
+ activesupport (4.2.7.1)
26
+ i18n (~> 0.7)
27
+ json (~> 1.7, >= 1.7.7)
28
+ minitest (~> 5.1)
29
+ thread_safe (~> 0.3, >= 0.3.4)
30
+ tzinfo (~> 1.1)
31
+ afm (0.2.2)
32
+ arel (6.0.3)
33
+ awesome_print (1.7.0)
34
+ builder (3.2.2)
35
+ byebug (9.0.5)
36
+ diff-lcs (1.2.5)
37
+ docile (1.1.5)
38
+ docsplit (0.7.6)
39
+ filesize (0.1.1)
40
+ hashery (2.1.2)
41
+ i18n (0.7.0)
42
+ json (1.8.3)
43
+ minitest (5.9.0)
44
+ pdf-reader (1.4.0)
45
+ Ascii85 (~> 1.0.0)
46
+ afm (~> 0.2.1)
47
+ hashery (~> 2.0)
48
+ ruby-rc4
49
+ ttfunk
50
+ pdf_utils (0.1.0)
51
+ prawn (0.7.2)
52
+ prawn-core (>= 0.7.2, < 0.8)
53
+ prawn-layout (>= 0.7.2, < 0.8)
54
+ prawn-security (>= 0.7.1, < 0.8)
55
+ prawn-core (0.7.2)
56
+ prawn-layout (0.7.2)
57
+ prawn-security (0.7.1)
58
+ rake (11.2.2)
59
+ rspec (3.5.0)
60
+ rspec-core (~> 3.5.0)
61
+ rspec-expectations (~> 3.5.0)
62
+ rspec-mocks (~> 3.5.0)
63
+ rspec-core (3.5.3)
64
+ rspec-support (~> 3.5.0)
65
+ rspec-expectations (3.5.0)
66
+ diff-lcs (>= 1.2.0, < 2.0)
67
+ rspec-support (~> 3.5.0)
68
+ rspec-mocks (3.5.0)
69
+ diff-lcs (>= 1.2.0, < 2.0)
70
+ rspec-support (~> 3.5.0)
71
+ rspec-support (3.5.0)
72
+ ruby-rc4 (0.1.5)
73
+ rubyzip (0.9.9)
74
+ simplecov (0.11.2)
75
+ docile (~> 1.1.0)
76
+ json (~> 1.8)
77
+ simplecov-html (~> 0.10.0)
78
+ simplecov-html (0.10.0)
79
+ thread_safe (0.3.5)
80
+ total_compressor (0.1.6)
81
+ awesome_print
82
+ rubyzip (~> 0.9.9)
83
+ ttfunk (1.4.0)
84
+ tzinfo (1.2.2)
85
+ thread_safe (~> 0.1)
86
+
87
+ PLATFORMS
88
+ ruby
89
+
90
+ DEPENDENCIES
91
+ act_as_page_extractor!
92
+ activerecord (~> 4.1)
93
+ awesome_print
94
+ bundler (~> 1.3)
95
+ byebug
96
+ docsplit
97
+ filesize
98
+ pdf-reader
99
+ pdf_utils
100
+ prawn (~> 0.7.1)
101
+ rake
102
+ rspec (>= 2.14)
103
+ simplecov
104
+ total_compressor
105
+
106
+ BUNDLED WITH
107
+ 1.13.7
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 Flower Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,119 @@
1
+ act_as_page_extractor
2
+ ================
3
+
4
+ Library for extracting plain text from documents (files) for further processing (indexing and searching).
5
+
6
+ ## Installation
7
+
8
+ Install appropriate tools before using:
9
+
10
+ sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ gem 'act_as_page_extractor'
15
+
16
+ ## Usage
17
+
18
+ For example, for the model Document we need to execute:
19
+
20
+ $ bundle
21
+ $ rails g act_as_page_extractor:migration Document category_id user_id
22
+
23
+ As a result we get two migration files:
24
+
25
+ class AddPageExtractorFields < ActiveRecord::Migration
26
+ def change
27
+ add_column :documents, :page_extraction_state, :string, default: ''
28
+ add_column :documents, :page_extraction_pages, :integer, default: 0
29
+ add_column :documents, :page_extraction_doctype, :string, default: ''
30
+ add_column :documents, :page_extraction_filesize, :string, default: ''
31
+ end
32
+ end
33
+
34
+ class CreateExtractedPages < ActiveRecord::Migration
35
+ def change
36
+ create_table :extracted_pages do |t|
37
+ t.text :page
38
+ t.integer :document_id
39
+ t.integer :category_id
40
+ t.integer :user_id
41
+ t.integer :page_number
42
+
43
+ t.timestamps null: false
44
+ end
45
+
46
+ add_index :extracted_pages, :document_id
47
+ add_index :extracted_pages, :category_id
48
+ add_index :extracted_pages, [:document_id, :category_id]
49
+ add_index :extracted_pages, [:document_id, :page_number]
50
+ end
51
+ end
52
+
53
+
54
+ The Document model must have a field that contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
55
+
56
+ Add the following parameters to the model to initialize it:
57
+
58
+ class Document < ActiveRecord::Base
59
+ include ActAsPageExtractor
60
+
61
+ act_as_page_extractor options: {
62
+ document_class: 'Document',
63
+ save_as_pdf: true,
64
+ filename: :filename,
65
+ document_id: :document_id,
66
+ additional_fields: [:category_id, :user_id],
67
+ #file_storage: "/full/path/to/documents/storage",
68
+ #pdf_storage: "/full/path/to/extracted/pdf/storage"
69
+ }
70
+
71
+ has_many :extracted_pages, dependent: :destroy
72
+ end
73
+
74
+ Now our instance has a few new methods:
75
+
76
+ document = Document.first
77
+ document.page_extract!
78
+ document.extracted_pages
79
+ document.pdf_path # if option 'save_as_pdf' is 'true'
80
+
81
+ # Access to pages
82
+ ExtractedPage.count
83
+
84
+ # Importing whole directory of documents
85
+ ActAsPageExtractor.import_files('/path/to/folder/with/documents')
86
+
87
+ # We can use cron to run the processing of all the new documents
88
+ ActAsPageExtractor.start_extraction
89
+
90
+ # Getting statistics information of all documents
91
+ ActAsPageExtractor.statistics
92
+
93
+ Parameters of initializing `act_as_page_extractor options: { ... }`:
94
+
95
+ `document_class` - name of the model (e.g. 'Document')
96
+ `save_as_pdf` - boolean [true, false]; set to true when we want to save the temporary pdf
97
+ `filename` - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
98
+ `document_id` - name for saving id
99
+ `additional_fields` - additional fields that are added to each extracted page (e.g. for indexing, etc.)
100
+ `file_storage` - path for saving tmp files (by default it is "public")
101
+ `pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
102
+
103
+ ## Run tests
104
+ $ COVERAGE=true rspec
105
+
106
+ ## Contributing
107
+ 1. Fork it
108
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
109
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
110
+ 4. Push to the branch (`git push origin my-new-feature`)
111
+ 5. Create new Pull Request
112
+
113
+ ## Contacts
114
+ https://github.com/phlowerteam
115
+ phlowerteam@gmail.com
116
+
117
+ ## License
118
+ Copyright (c) 2017 PhlowerTeam
119
+ MIT License
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'act_as_page_extractor/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'act_as_page_extractor'
8
+ spec.version = ActAsPageExtractor::VERSION
9
+ spec.authors = ['PhlowerTeam']
10
+ spec.email = ['phlowerteam@gmail.com']
11
+ spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
12
+ spec.summary = %q{Uses system calls}
13
+ spec.homepage = 'https://github.com/phlowerteam'
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.add_development_dependency 'bundler', '~> 1.3'
21
+ spec.add_development_dependency 'rake'
22
+ spec.add_development_dependency 'byebug'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'simplecov'
25
+
26
+ spec.add_runtime_dependency 'activerecord', '~> 4.1'
27
+ spec.add_runtime_dependency 'awesome_print'
28
+ spec.add_runtime_dependency 'docsplit' # API for OpenOffice jodconverter (any to pdf)
29
+ spec.add_runtime_dependency 'pdf_utils' # getting text from pdf
30
+ spec.add_runtime_dependency 'prawn', '~>0.7.1' # need for pdf_utils
31
+ spec.add_runtime_dependency 'pdf-reader' # need for pdf_utils
32
+ spec.add_runtime_dependency 'total_compressor' # decompressing files
33
+ spec.add_runtime_dependency 'filesize' # pretty size of file
34
+ end
@@ -0,0 +1,126 @@
1
+ require 'act_as_page_extractor/version'
2
+
3
+ require 'active_record'
4
+
5
+ require 'awesome_print'
6
+ require 'filesize'
7
+ require 'total_compressor'
8
+ require 'docsplit'
9
+ require 'pdf_utils'
10
+ require 'prawn'
11
+ require 'pdf-reader'
12
+
13
+ require 'act_as_page_extractor/modules/tools.rb'
14
+ require 'act_as_page_extractor/modules/validating.rb'
15
+ require 'act_as_page_extractor/modules/unzipping.rb'
16
+ require 'act_as_page_extractor/modules/extracting.rb'
17
+ require 'act_as_page_extractor/modules/saving.rb'
18
+
19
+ require 'act_as_page_extractor/modules/interface'
20
+
21
# Mixin that adds page-extraction behaviour to an ActiveRecord model.
#
# Including the module registers a `before_create` and a `before_destroy`
# callback on the host model and exposes the `act_as_page_extractor` class
# macro used to configure it. `page_extract!` is the public entry point that
# runs the extraction pipeline for a single record.
#
# Several steps called from here (`cleanup_pages`, `unzip_document`,
# `valid_document`, `extract_pages`, `save_to_db`, `update_state`, `save_pdf`,
# `debug_info`, `remove_files`) are not defined in this file.
# NOTE(review): presumably they come from the files under
# act_as_page_extractor/modules/ required above — confirm.
module ActAsPageExtractor
  extend ActiveSupport::Concern

  included do
    # Every newly created record starts in the 'new' extraction state.
    before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
    before_destroy :remove_files
  end

  module ClassMethods
    # Configures page extraction for the including model.
    #
    # Recognised keys of +options+ (all read lazily, when the generated
    # method is called):
    #   :save_as_pdf       - returned as-is by #save_as_pdf
    #   :filename          - name of the attribute/method sent to the record
    #                        by #extracted_filename
    #   :document_class    - constant name resolved with Object.const_get
    #   :document_id       - returned as-is by #extracted_document_id
    #   :additional_fields - defaults to [] when missing
    #   :file_storage      - defaults to FILE_STORAGE when missing
    #   :pdf_storage       - defaults to PDF_STORAGE when missing
    #
    # Every generated method accepts and ignores any positional arguments.
    def act_as_page_extractor(options: {})
      # Per-model settings, exposed as instance methods of the host class.
      define_method(:save_as_pdf)           { |*_args| options[:save_as_pdf] }
      define_method(:extracted_filename)    { |*_args| send(options[:filename].to_sym) }
      define_method(:extracted_document_id) { |*_args| options[:document_id] }
      define_method(:additional_fields)     { |*_args| options[:additional_fields] || [] }
      define_method(:file_storage)          { |*_args| options[:file_storage] || FILE_STORAGE }
      define_method(:pdf_storage)           { |*_args| options[:pdf_storage] || PDF_STORAGE }

      # Module-wide settings. These singleton methods live on ActAsPageExtractor
      # itself and are redefined on every call of this macro, so the values of
      # the most recent call are the ones returned.
      ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
      ActAsPageExtractor.define_singleton_method(:document_class) do |*_args|
        Object.const_get(options[:document_class])
      end
    end
  end

  # Values stored in the model's `page_extraction_state` attribute.
  EXTRACTING_STATES = {
    new: 'new',
    extracting: 'extracting',
    extracted: 'extracted',
    'error.extraction': 'error.extraction'
  }.freeze

  # Default locations. All of them are derived from the working directory of
  # the process at the moment this file is loaded.
  TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
  FILE_STORAGE = "#{Dir.pwd}/public".freeze
  PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze

  # Prepares the record for extraction; currently this only makes sure the
  # PDF output directory exists (see #create_pdf_dir).
  def initialized
    create_pdf_dir
  end

  # Runs the extraction pipeline for this record.
  #
  # The document is copied into a fresh temporary directory, unzipped, and,
  # when `valid_document` is truthy, its pages are extracted and saved.
  # The state update, PDF saving and debug output run afterwards even when an
  # earlier step raised, and the temporary directory is removed even when one
  # of those follow-up steps raises as well. Exceptions are not swallowed.
  def page_extract!
    initialized
    cleanup_pages
    create_tmp_dir
    begin
      copy_document
      unzip_document
      if valid_document
        extract_pages
        save_to_db
      end
    ensure
      begin
        update_state
        save_pdf
        debug_info
      ensure
        # Kept in its own ensure so a failure in the steps above cannot leave
        # the temporary directory behind.
        finish
      end
    end
  end

  private

  # Creates the PDF storage directory when PDF saving is enabled.
  def create_pdf_dir
    return unless save_as_pdf

    # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
    FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
  end

  # Creates a uniquely named working directory below
  # TMP_EXTRACTION_FILE_STORAGE and remembers it in @tmp_dir.
  def create_tmp_dir
    @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
    FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
  end

  # Copies the source document into @tmp_dir.
  #
  # Sets @origin_document_path (file_storage + the `url` of the configured
  # file attribute), @copy_document_path (location of the copy) and
  # @document_filename (last path segment of the origin path). The two
  # latter variables are only assigned after the copy succeeded.
  def copy_document
    @origin_document_path = "#{file_storage}#{extracted_filename.url}"
    FileUtils.cp(@origin_document_path, @tmp_dir)
    filename = @origin_document_path.split('/').last
    @copy_document_path = "#{@tmp_dir}/#{filename}"
    @document_filename = filename
  end

  # Final pipeline step: discards the temporary working directory.
  def finish
    remove_tmp_dir
  end

  # Deletes @tmp_dir recursively. As a safety net nothing is deleted unless
  # the path contains a "/tmp/" segment (this also covers an unset @tmp_dir).
  def remove_tmp_dir
    FileUtils.rm_rf(@tmp_dir) if @tmp_dir =~ %r{/tmp/}
  end
end
112
+
113
+ # rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
114
+
115
+ # Rails 4 way
116
+ # 9.2.7.1 Multiple Callback Methods in One Class
117
+ # 258 page
118
+
119
+ # class ActiveRecord::Base
120
+ # def self.acts_as_page_extractor(document_field=:filename)
121
+ # auditor = Auditor.new(audit_log)
122
+ # after_create auditor
123
+ # after_update auditor
124
+ # after_destroy auditor
125
+ # end
126
+ # end