act_as_page_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
@@ -0,0 +1,125 @@
1
+ require 'act_as_page_extractor/version'
2
+
3
+ require 'active_record'
4
+
5
+ require 'awesome_print'
6
+ require 'filesize'
7
+ require 'total_compressor'
8
+ require 'docsplit'
9
+ require 'pdf_utils'
10
+ require 'prawn'
11
+ require 'pdf-reader'
12
+
13
+ require 'act_as_page_extractor/modules/tools.rb'
14
+ require 'act_as_page_extractor/modules/validating.rb'
15
+ require 'act_as_page_extractor/modules/unzipping.rb'
16
+ require 'act_as_page_extractor/modules/extracting.rb'
17
+ require 'act_as_page_extractor/modules/saving.rb'
18
+
19
+ require 'act_as_page_extractor/modules/interface'
20
+
21
# Concern that adds page-by-page text extraction to a model.
#
# Including the module registers two callbacks on the host model (see the
# +included+ block) and exposes the +act_as_page_extractor+ class macro plus
# the +page_extract!+ instance entry point.
#
# Several helpers called below (+remove_files+, +cleanup_pages+,
# +unzip_document+, +valid_document+, +extract_pages+, +save_to_db+,
# +update_state+, +save_pdf+) are not defined in this file.
# NOTE(review): presumably they come from the files required above under
# act_as_page_extractor/modules/ -- confirm.
module ActAsPageExtractor
  extend ActiveSupport::Concern

  included do
    # Every new record starts in the 'new' extraction state.
    before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
    before_destroy :remove_files
  end

  # attr_reader :options

  module ClassMethods
    # Class macro that configures extraction for the including model.
    #
    # @param options [Hash] configuration; the keys read here are
    #   +:save_as_pdf+, +:filename+, +:document_class+, +:document_id+ and
    #   +:additional_fields+.
    #
    # Defines instance readers on the model and two singleton methods on
    # ActAsPageExtractor itself.
    # NOTE(review): the singleton methods are shared module-wide, so a second
    # model calling this macro overwrites the values set by the first.
    def act_as_page_extractor(options: {})
      define_method(:save_as_pdf) { |*_args| options[:save_as_pdf] }
      # Returns the value of the attribute named by options[:filename].
      define_method(:extracted_filename) { |*_args| send(options[:filename].to_sym) }
      ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
      ActAsPageExtractor.define_singleton_method(:document_class) { |*_args| options[:document_class].constantize }
      define_method(:extracted_document_id) { |*_args| options[:document_id] }
      define_method(:additional_fields) { |*_args| options[:additional_fields] }
    end
  end

  # Values stored in +page_extraction_state+.
  EXTRACTING_STATES = {
    new: 'new',
    extracting: 'extracting',
    extracted: 'extracted',
    'error.extraction': 'error.extraction'
  }.freeze

  # All storage paths are derived from Dir.pwd at the moment this file is
  # loaded, not at the moment a document is processed.
  TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
  FILE_STORAGE = "#{Dir.pwd}/public".freeze
  PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze

  # Prepares the environment before an extraction run; currently this only
  # makes sure the PDF output directory exists.
  #
  # TODO: add all needed callbacks (e.g. remove the pdf on destroy).
  # TODO: add to the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and add to the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  def initialized
    create_pdf_dir
  end

  # Runs one full extraction for this record.
  #
  # The document is copied into a fresh temporary directory and unzipped;
  # pages are extracted and saved only when +valid_document+ is truthy.
  # The +ensure+ section always updates the state, saves the PDF and removes
  # the temporary directory, even when a step raises. There is no +rescue+,
  # so the exception still propagates to the caller afterwards.
  def page_extract!
    initialized
    cleanup_pages
    create_tmp_dir
    begin
      copy_document
      # debug_info
      unzip_document
      if valid_document
        extract_pages
        save_to_db
      end
    ensure
      update_state
      save_pdf
      finish
    end
  end

  # Creates PDF_STORAGE when the model is configured with +save_as_pdf+.
  #
  # FileUtils.mkdir_p does nothing for a directory that already exists, so
  # no existence check is needed (File.exists? was removed in Ruby 3.2).
  def create_pdf_dir
    FileUtils.mkdir_p(PDF_STORAGE) if save_as_pdf
  end

  # Creates a uniquely named working directory under
  # TMP_EXTRACTION_FILE_STORAGE and remembers it in @tmp_dir.
  def create_tmp_dir
    @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
    FileUtils.mkdir_p(@tmp_dir)
  end

  # Copies the source document into @tmp_dir.
  #
  # Sets @origin_document_path (FILE_STORAGE + the attachment's +url+),
  # then, once the copy has succeeded, @copy_document_path and
  # @document_filename.
  def copy_document
    @origin_document_path = "#{FILE_STORAGE}#{extracted_filename.url}"
    FileUtils.cp(@origin_document_path, @tmp_dir)
    filename = File.basename(@origin_document_path)
    @copy_document_path = "#{@tmp_dir}/#{filename}"
    @document_filename = filename
  end

  # Final cleanup step of an extraction run.
  def finish
    remove_tmp_dir
  end

  # Deletes the working directory. As a safety guard the path must contain
  # "/tmp/"; a nil @tmp_dir is skipped as well.
  def remove_tmp_dir
    FileUtils.rm_rf(@tmp_dir) if @tmp_dir =~ %r{/tmp/}
  end
end
111
+
112
+ # rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
113
+
114
+ # See "The Rails 4 Way",
115
+ # section 9.2.7.1 "Multiple Callback Methods in One Class",
116
+ # page 258.
117
+
118
+ # class ActiveRecord::Base
119
+ # def self.acts_as_page_extractor(document_field=:filename)
120
+ # auditor = Auditor.new(audit_log)
121
+ # after_create auditor
122
+ # after_update auditor
123
+ # after_destroy auditor
124
+ # end
125
+ # end
Binary file
metadata ADDED
@@ -0,0 +1,279 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: act_as_page_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - PhlowerTeam
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-01-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: activerecord
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.1'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.1'
97
+ - !ruby/object:Gem::Dependency
98
+ name: awesome_print
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: docsplit
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pdf_utils
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: prawn
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.7.1
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.7.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: pdf-reader
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: total_compressor
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: filesize
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ description: Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files
196
+ with OpenOffice
197
+ email:
198
+ - phlowerteam@gmail.com
199
+ executables: []
200
+ extensions: []
201
+ extra_rdoc_files: []
202
+ files:
203
+ - ".gitignore"
204
+ - ".rmvrc"
205
+ - ".rspec"
206
+ - ".ruby-gemset"
207
+ - ".ruby-version"
208
+ - Gemfile
209
+ - Gemfile.lock
210
+ - LICENSE
211
+ - README.md
212
+ - Rakefile
213
+ - act_as_page_extractor.gemspec
214
+ - lib/act_as_page_extractor.rb
215
+ - lib/act_as_page_extractor/modules/extracting.rb
216
+ - lib/act_as_page_extractor/modules/interface.rb
217
+ - lib/act_as_page_extractor/modules/saving.rb
218
+ - lib/act_as_page_extractor/modules/tools.rb
219
+ - lib/act_as_page_extractor/modules/unzipping.rb
220
+ - lib/act_as_page_extractor/modules/validating.rb
221
+ - lib/act_as_page_extractor/version.rb
222
+ - lib/generators/act_as_page_extractor/migration_generator.rb
223
+ - lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb
224
+ - lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
225
+ - lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb
226
+ - lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb
227
+ - spec/act_as_page_extractor_spec.rb
228
+ - spec/spec_helper.rb
229
+ - spec/support/models.rb
230
+ - test/test-doc-3-pages.doc
231
+ - test/test-doc-3-pages.docx
232
+ - test/test-doc-3-pages.docx.7z
233
+ - test/test-doc-3-pages.docx.rar
234
+ - test/test-doc-3-pages.docx.zip
235
+ - test/test-doc-3-pages.html
236
+ - test/test-doc-3-pages.odt
237
+ - test/test-doc-3-pages.pdf
238
+ - test/test-doc-3-pages.rtf
239
+ - test/test-doc-3-pages.txt
240
+ - test/test-doc-3-pages.wrong
241
+ homepage: https://github.com/phlowerteam
242
+ licenses:
243
+ - MIT
244
+ metadata: {}
245
+ post_install_message:
246
+ rdoc_options: []
247
+ require_paths:
248
+ - lib
249
+ required_ruby_version: !ruby/object:Gem::Requirement
250
+ requirements:
251
+ - - ">="
252
+ - !ruby/object:Gem::Version
253
+ version: '0'
254
+ required_rubygems_version: !ruby/object:Gem::Requirement
255
+ requirements:
256
+ - - ">="
257
+ - !ruby/object:Gem::Version
258
+ version: '0'
259
+ requirements: []
260
+ rubyforge_project:
261
+ rubygems_version: 2.5.1
262
+ signing_key:
263
+ specification_version: 4
264
+ summary: Uses system calls
265
+ test_files:
266
+ - spec/act_as_page_extractor_spec.rb
267
+ - spec/spec_helper.rb
268
+ - spec/support/models.rb
269
+ - test/test-doc-3-pages.doc
270
+ - test/test-doc-3-pages.docx
271
+ - test/test-doc-3-pages.docx.7z
272
+ - test/test-doc-3-pages.docx.rar
273
+ - test/test-doc-3-pages.docx.zip
274
+ - test/test-doc-3-pages.html
275
+ - test/test-doc-3-pages.odt
276
+ - test/test-doc-3-pages.pdf
277
+ - test/test-doc-3-pages.rtf
278
+ - test/test-doc-3-pages.txt
279
+ - test/test-doc-3-pages.wrong