act_as_page_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/.rmvrc +1 -0
  4. data/.rspec +3 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/Gemfile +22 -0
  8. data/Gemfile.lock +107 -0
  9. data/LICENSE +21 -0
  10. data/README.md +119 -0
  11. data/Rakefile +6 -0
  12. data/act_as_page_extractor.gemspec +34 -0
  13. data/lib/act_as_page_extractor.rb +126 -0
  14. data/lib/act_as_page_extractor/modules/extracting.rb +35 -0
  15. data/lib/act_as_page_extractor/modules/interface.rb +30 -0
  16. data/lib/act_as_page_extractor/modules/saving.rb +47 -0
  17. data/lib/act_as_page_extractor/modules/tools.rb +54 -0
  18. data/lib/act_as_page_extractor/modules/unzipping.rb +15 -0
  19. data/lib/act_as_page_extractor/modules/validating.rb +22 -0
  20. data/lib/act_as_page_extractor/version.rb +5 -0
  21. data/lib/generators/act_as_page_extractor/migration_generator.rb +49 -0
  22. data/lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb +14 -0
  23. data/lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb +8 -0
  24. data/lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb +19 -0
  25. data/lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb +3 -0
  26. data/spec/act_as_page_extractor_spec.rb +46 -0
  27. data/spec/spec_helper.rb +8 -0
  28. data/spec/support/models.rb +92 -0
  29. data/test/test-doc-3-pages.doc +0 -0
  30. data/test/test-doc-3-pages.docx +0 -0
  31. data/test/test-doc-3-pages.docx.7z +0 -0
  32. data/test/test-doc-3-pages.docx.rar +0 -0
  33. data/test/test-doc-3-pages.docx.zip +0 -0
  34. data/test/test-doc-3-pages.html +279 -0
  35. data/test/test-doc-3-pages.odt +0 -0
  36. data/test/test-doc-3-pages.pdf +0 -0
  37. data/test/test-doc-3-pages.rtf +339 -0
  38. data/test/test-doc-3-pages.txt +125 -0
  39. data/test/test-doc-3-pages.wrong +0 -0
  40. metadata +279 -0
@@ -0,0 +1,125 @@
1
+ require 'act_as_page_extractor/version'
2
+
3
+ require 'active_record'
4
+
5
+ require 'awesome_print'
6
+ require 'filesize'
7
+ require 'total_compressor'
8
+ require 'docsplit'
9
+ require 'pdf_utils'
10
+ require 'prawn'
11
+ require 'pdf-reader'
12
+
13
+ require 'act_as_page_extractor/modules/tools.rb'
14
+ require 'act_as_page_extractor/modules/validating.rb'
15
+ require 'act_as_page_extractor/modules/unzipping.rb'
16
+ require 'act_as_page_extractor/modules/extracting.rb'
17
+ require 'act_as_page_extractor/modules/saving.rb'
18
+
19
+ require 'act_as_page_extractor/modules/interface'
20
+
21
# Mixin that adds page-by-page text extraction to an ActiveRecord document
# model.
#
# Include the module in a model and configure it with +act_as_page_extractor+.
# Calling +page_extract!+ on a record then copies the source document into a
# temporary directory and runs the extraction pipeline on that copy.
#
# Several steps used below (cleanup_pages, unzip_document, valid_document,
# extract_pages, save_to_db, update_state, save_pdf, remove_files) are not
# defined in this file; presumably they are provided by the
# act_as_page_extractor/modules/* files required by this file -- verify there.
module ActAsPageExtractor
  extend ActiveSupport::Concern

  included do
    # Every new record starts in the 'new' extraction state.
    before_create { self.page_extraction_state = EXTRACTING_STATES[:new] }
    before_destroy :remove_files
  end

  # attr_reader :options

  module ClassMethods
    # Configures page extraction for the including model.
    #
    # options - Hash of settings read by the generated accessors:
    #           :save_as_pdf       - returned by #save_as_pdf
    #           :filename          - name of the method on the record that
    #                                returns the uploaded file object
    #           :document_class    - String class name, constantized on demand
    #           :document_id       - returned by #extracted_document_id
    #           :additional_fields - returned by #additional_fields
    #
    # NOTE(review): extracted_filename and document_class are also defined as
    # singleton methods on ActAsPageExtractor itself, so a later call of this
    # macro from another model replaces the values set by an earlier one.
    def act_as_page_extractor(options: {})
      define_method(:save_as_pdf) { |*_args| options[:save_as_pdf] }
      define_method(:extracted_filename) { |*_args| send(options[:filename].to_sym) }
      ActAsPageExtractor.define_singleton_method(:extracted_filename) { |*_args| options[:filename] }
      ActAsPageExtractor.define_singleton_method(:document_class) { |*_args| options[:document_class].constantize }
      define_method(:extracted_document_id) { |*_args| options[:document_id] }
      define_method(:additional_fields) { |*_args| options[:additional_fields] }
    end
  end

  # Values stored in the record's page_extraction_state attribute.
  EXTRACTING_STATES = {
    new: 'new',
    extracting: 'extracting',
    extracted: 'extracted',
    'error.extraction': 'error.extraction'
  }.freeze

  # All storage paths are resolved against the process working directory at
  # load time.
  TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze
  FILE_STORAGE = "#{Dir.pwd}/public".freeze
  PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze

  # Prepares the record for extraction; currently this only makes sure the
  # PDF output directory exists.
  #
  # TODO: add all needed callbacks.
  # TODO: remove the generated PDF when the record is destroyed.
  # TODO: document in the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and add to the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  def initialized
    create_pdf_dir
  end

  # Runs the whole extraction pipeline for this record.
  #
  # update_state, save_pdf and finish run even when an earlier step raises;
  # the exception still propagates to the caller afterwards.
  def page_extract!
    initialized
    cleanup_pages
    create_tmp_dir
    begin
      copy_document
      # debug_info
      unzip_document
      if valid_document
        extract_pages
        save_to_db
      end
    ensure
      begin
        update_state
        save_pdf
      ensure
        # Nested ensure: the temporary directory must be removed even if
        # update_state or save_pdf raises.
        finish
      end
    end
  end

  # Creates PDF_STORAGE when the model is configured with save_as_pdf.
  def create_pdf_dir
    return unless save_as_pdf

    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    FileUtils.mkdir_p(PDF_STORAGE) unless File.exist?(PDF_STORAGE)
  end

  # Creates a uniquely named working directory under
  # TMP_EXTRACTION_FILE_STORAGE and remembers it in @tmp_dir.
  def create_tmp_dir
    @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
    FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
  end

  # Copies the source document into @tmp_dir.
  #
  # Sets @origin_document_path, @document_filename and @copy_document_path,
  # and returns the document's file name.
  def copy_document
    # The uploader's url is appended to FILE_STORAGE as-is, so it is expected
    # to be a path relative to the public directory.
    @origin_document_path = "#{FILE_STORAGE}#{extracted_filename.url}"
    FileUtils.cp(@origin_document_path, @tmp_dir)
    @document_filename = File.basename(@origin_document_path)
    @copy_document_path = "#{@tmp_dir}/#{@document_filename}"
    @document_filename
  end

  # Final step of the pipeline: removes the temporary working directory.
  def finish
    remove_tmp_dir
  end

  # Deletes @tmp_dir recursively. The path must contain "/tmp/"; this guards
  # against rm_rf being run on an unset or unexpected path.
  def remove_tmp_dir
    FileUtils.rm_rf(@tmp_dir) if @tmp_dir =~ %r{/tmp/}
  end
end
111
+
112
+ # rails g model ExtractedPage page:text document_id:integer category_id:integer page_number:integer
113
+
114
+ # Rails 4 way
115
+ # 9.2.7.1 Multiple Callback Methods in One Class
116
+ # 258 page
117
+
118
+ # class ActiveRecord::Base
119
+ # def self.acts_as_page_extractor(document_field=:filename)
120
+ # auditor = Auditor.new(audit_log)
121
+ # after_create auditor
122
+ # after_update auditor
123
+ # after_destroy auditor
124
+ # end
125
+ # end
Binary file
metadata ADDED
@@ -0,0 +1,279 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: act_as_page_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - PhlowerTeam
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-01-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: activerecord
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.1'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.1'
97
+ - !ruby/object:Gem::Dependency
98
+ name: awesome_print
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: docsplit
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pdf_utils
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: prawn
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.7.1
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.7.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: pdf-reader
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: total_compressor
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: filesize
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ description: Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files
196
+ with OpenOffice
197
+ email:
198
+ - phlowerteam@gmail.com
199
+ executables: []
200
+ extensions: []
201
+ extra_rdoc_files: []
202
+ files:
203
+ - ".gitignore"
204
+ - ".rmvrc"
205
+ - ".rspec"
206
+ - ".ruby-gemset"
207
+ - ".ruby-version"
208
+ - Gemfile
209
+ - Gemfile.lock
210
+ - LICENSE
211
+ - README.md
212
+ - Rakefile
213
+ - act_as_page_extractor.gemspec
214
+ - lib/act_as_page_extractor.rb
215
+ - lib/act_as_page_extractor/modules/extracting.rb
216
+ - lib/act_as_page_extractor/modules/interface.rb
217
+ - lib/act_as_page_extractor/modules/saving.rb
218
+ - lib/act_as_page_extractor/modules/tools.rb
219
+ - lib/act_as_page_extractor/modules/unzipping.rb
220
+ - lib/act_as_page_extractor/modules/validating.rb
221
+ - lib/act_as_page_extractor/version.rb
222
+ - lib/generators/act_as_page_extractor/migration_generator.rb
223
+ - lib/generators/act_as_page_extractor/templates/act_as_page_extractor.rb.erb
224
+ - lib/generators/act_as_page_extractor/templates/add_page_extractor_fields_to_documents.rb.erb
225
+ - lib/generators/act_as_page_extractor/templates/create_extracted_pages_table.rb.erb
226
+ - lib/generators/act_as_page_extractor/templates/extracted_page.rb.erb
227
+ - spec/act_as_page_extractor_spec.rb
228
+ - spec/spec_helper.rb
229
+ - spec/support/models.rb
230
+ - test/test-doc-3-pages.doc
231
+ - test/test-doc-3-pages.docx
232
+ - test/test-doc-3-pages.docx.7z
233
+ - test/test-doc-3-pages.docx.rar
234
+ - test/test-doc-3-pages.docx.zip
235
+ - test/test-doc-3-pages.html
236
+ - test/test-doc-3-pages.odt
237
+ - test/test-doc-3-pages.pdf
238
+ - test/test-doc-3-pages.rtf
239
+ - test/test-doc-3-pages.txt
240
+ - test/test-doc-3-pages.wrong
241
+ homepage: https://github.com/phlowerteam
242
+ licenses:
243
+ - MIT
244
+ metadata: {}
245
+ post_install_message:
246
+ rdoc_options: []
247
+ require_paths:
248
+ - lib
249
+ required_ruby_version: !ruby/object:Gem::Requirement
250
+ requirements:
251
+ - - ">="
252
+ - !ruby/object:Gem::Version
253
+ version: '0'
254
+ required_rubygems_version: !ruby/object:Gem::Requirement
255
+ requirements:
256
+ - - ">="
257
+ - !ruby/object:Gem::Version
258
+ version: '0'
259
+ requirements: []
260
+ rubyforge_project:
261
+ rubygems_version: 2.5.1
262
+ signing_key:
263
+ specification_version: 4
264
+ summary: Uses system calls
265
+ test_files:
266
+ - spec/act_as_page_extractor_spec.rb
267
+ - spec/spec_helper.rb
268
+ - spec/support/models.rb
269
+ - test/test-doc-3-pages.doc
270
+ - test/test-doc-3-pages.docx
271
+ - test/test-doc-3-pages.docx.7z
272
+ - test/test-doc-3-pages.docx.rar
273
+ - test/test-doc-3-pages.docx.zip
274
+ - test/test-doc-3-pages.html
275
+ - test/test-doc-3-pages.odt
276
+ - test/test-doc-3-pages.pdf
277
+ - test/test-doc-3-pages.rtf
278
+ - test/test-doc-3-pages.txt
279
+ - test/test-doc-3-pages.wrong