carrierwave-docsplit 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 130d69ad01b1df6ed26e49629fdba48deb463cc0
4
+ data.tar.gz: f288596517c59c64764c5a4a9b64734c75ba326c
5
+ SHA512:
6
+ metadata.gz: f78aa024c446ade564da2c3b18b46025bb5d2009e178142cdccee0ce5d68cecb7f66acacd25a93d79d4ee93ea620ccf7abe2ef37a34634c7b0dbfbe97ed02e3e
7
+ data.tar.gz: 8c5e6679a90e2da7e8658988bdf79d02cff814c51f65ba03f14125aad8ca153fc19765f051ff2f2fdd4672c3976bef1d89e71b924efce08f271b5d5e6a97207a
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ test/uploads
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in carrierwave-docsplit.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ carrierwave-docsplit (0.0.1)
5
+ carrierwave
6
+ docsplit
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ activemodel (3.2.2)
12
+ activesupport (= 3.2.2)
13
+ builder (~> 3.0.0)
14
+ activerecord (3.2.2)
15
+ activemodel (= 3.2.2)
16
+ activesupport (= 3.2.2)
17
+ arel (~> 3.0.2)
18
+ tzinfo (~> 0.3.29)
19
+ activesupport (3.2.2)
20
+ i18n (~> 0.6)
21
+ multi_json (~> 1.0)
22
+ ansi (1.4.2)
23
+ arel (3.0.2)
24
+ builder (3.0.0)
25
+ carrierwave (0.6.0)
26
+ activemodel (>= 3.2.0)
27
+ activesupport (>= 3.2.0)
28
+ docsplit (0.6.3)
29
+ i18n (0.6.0)
30
+ multi_json (1.2.0)
31
+ turn (0.9.4)
32
+ ansi
33
+ tzinfo (0.3.32)
34
+
35
+ PLATFORMS
36
+ ruby
37
+
38
+ DEPENDENCIES
39
+ activerecord
40
+ carrierwave-docsplit!
41
+ turn
data/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # CarrierWave + Docsplit: A Loving Union
2
+
3
+ carrierwave-docsplit is a thin wrapper around docsplit that knows how to talk to carrierwave.
4
+
5
+ # Usage
6
+
7
+ 1. Require the gem and extend your uploader class with the integration module.
8
+
9
+ ```ruby
10
+ extend CarrierWave::DocsplitIntegration
11
+ ```
12
+
13
+ 2. Hook in the integration.
14
+
15
+ ```ruby
16
+ extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
17
+ ```
data/Rakefile ADDED
@@ -0,0 +1,12 @@
# Rake tasks for the gem: Bundler's packaging tasks, a test runner and a console.
require "bundler/gem_tasks"
require 'rake/testtask'

# `rake test`: run every test/test*.rb file with test/ added to the load path.
Rake::TestTask.new do |test_task|
  test_task.libs << "test"
  test_task.test_files = FileList['test/test*.rb']
  test_task.verbose = true
end

# `rake console`: start irb with the test uploaders already required.
task :console do
  sh "irb -r ./test/test_uploader"
end
@@ -0,0 +1,26 @@
# -*- encoding: utf-8 -*-
# Gem specification for carrierwave-docsplit.

# Make lib/ loadable so the version constant can be required below.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "carrierwave-docsplit/version"

Gem::Specification.new do |spec|
  spec.name        = "carrierwave-docsplit"
  spec.version     = Carrierwave::Docsplit::VERSION
  spec.authors     = ["Justin Woodbridge"]
  spec.email       = ["jwoodbridge@me.com"]
  spec.homepage    = ""
  spec.summary     = "Bring together docsplit and carrierwave in a loving union."
  spec.description = "Bring together docsplit and carrierwave in a loving union."

  spec.rubyforge_project = "carrierwave-docsplit"

  # File lists are taken from git, so only tracked files end up in the gem.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only: test output formatter.
  spec.add_development_dependency "turn"

  # Needed at runtime by lib/carrierwave-docsplit.rb.
  spec.add_runtime_dependency "docsplit"
  spec.add_runtime_dependency "carrierwave"
end
@@ -0,0 +1,127 @@
1
+ require "carrierwave-docsplit/version"
2
+
3
+ require 'carrierwave'
4
+ require 'docsplit'
5
+ require 'json'
6
+ require 'pathname'
7
+ require 'fileutils'
8
+
9
+ module CarrierWave
10
+ module DocsplitIntegration
11
+
# Raised by text extraction when the uploader has no model to write the
# extracted text to.
#
# Inherits from StandardError (not Exception) so that a plain `rescue`
# catches it. The default message is supplied through the constructor, which
# keeps `message` and `to_s` in agreement and still allows a custom message.
class NoModelError < StandardError
  DEFAULT_MESSAGE = "Text extraction requires a model for the uploader to write results to."

  # message - optional String overriding the default explanation.
  def initialize(message = DEFAULT_MESSAGE)
    super
  end
end
17
+
# Class-level entry point for configuring extraction on an uploader.
#
# options - Hash with optional keys:
#           :images - passed to setup_image_extraction
#           :text   - passed to setup_text_extraction
#
# A key that is missing (or falsy) leaves that kind of extraction unconfigured.
def extract(options = {})
  image_options = options[:images]
  setup_image_extraction(image_options) if image_options

  text_options = options[:text]
  setup_text_extraction(text_options) if text_options
end
31
+
# Registers text extraction on the uploader class.
#
# Adds :enact_text_extraction to the processing queue and defines it as an
# instance method. When run, that method extracts text (OCR disabled) from the
# uploaded file into "<store_dir>/<file basename>", reads the extracted .txt
# file and assigns its contents to the model through the "<to>=" writer.
#
# options - Hash; options[:to] names the model attribute that receives the text.
#
# The generated method raises NoModelError when the uploader has no model and
# Errno::ENOENT when extraction left no .txt file in the output directory.
def setup_text_extraction(options)
  instance_eval do
    process :enact_text_extraction

    define_method :enact_text_extraction do
      raise NoModelError if @model.nil?

      out = File.join(store_dir, file.basename)
      FileUtils.mkdir_p(out)
      Docsplit.extract_text(file.path, :ocr => false, :output => out)

      # Sort so the chosen file does not depend on directory listing order.
      text_file = Dir.glob(File.join(out, '*.txt')).sort.first
      # Fail with a descriptive error instead of File.read(nil)'s TypeError.
      raise Errno::ENOENT, "no extracted text found in #{out}" if text_file.nil?

      @model.send("#{options[:to]}=", File.read(text_file))
    end
  end
end
48
+
# Registers image extraction on the uploader class.
#
# options - Hash describing the extraction:
#           :to    - name of the reader method to define (e.g. :thumbs)
#           :sizes - Hash of version name => geometry string,
#                    e.g. { :large => "300x", :medium => "500x" }
#           The flat form { :to => :thumbs, :large => "300x" } is accepted as
#           well; every key other than :to is then treated as a size.
#           The hash is never mutated.
#
# Defines three instance methods on the uploader:
#   output_path      - "<store_dir>/<file basename>", or nil when no file is set
#   <options[:to]>   - reader returning { geometry => [image paths] }
#   enact_extraction - processor that runs Docsplit.extract_images
def setup_image_extraction(options)
  sizes = options[:sizes] || options.reject { |key, _| key == :to }
  reader_name = options[:to]

  instance_eval do
    define_method :output_path do
      return nil if file.nil?
      File.join(store_dir, file.basename)
    end

    # Latch our extraction method into the processing queue.
    process :enact_extraction => sizes

    # Reader for the thumbnails stored on disk. Returns a hash such as
    #
    #   { '700x' => ['/uploads/w9/700x/w9_1.png'] }
    #
    # so a size can be looked up with file.thumbs['700x']. Returns an empty
    # hash when the uploader has no file yet.
    define_method reader_name do
      base = output_path
      return {} if base.nil?

      entries = Dir.glob(File.join(base, '*'))

      # Only the entry's own name is tested, so a size-like directory higher
      # up the path cannot be mistaken for a size directory.
      if entries.any? { |entry| File.basename(entry) =~ /\d+x\d*/ }
        # Multiple sizes: one sub-directory per size.
        entries.each_with_object({}) do |dir, found|
          found[File.basename(dir)] = Dir.glob(File.join(dir, '*'))
        end
      else
        # Only a single size supplied: the images are presumably written
        # straight into the output directory -- TODO confirm against Docsplit.
        { sizes.values.first => entries }
      end
    end

    define_method :enact_extraction do |*args|
      args.flatten!

      # The sizes hash arrives splatted into pairs; zip it back into the hash
      # the user wrote.
      if args.empty? || args.size.odd?
        raise ArgumentError, "Need an even amount of arguments (Given #{args.size} #{args.size % 2})"
      end

      requested = Hash[*args]

      self.class.class_eval do
        requested.keys.each { |size| version size }
      end

      out = output_path
      FileUtils.mkdir_p out

      Docsplit.extract_images file.path, :size => requested.values, :output => out
    end
  end
end
126
+ end
127
+ end
@@ -0,0 +1,5 @@
module Carrierwave
  module Docsplit
    # Gem version string. Frozen so the constant cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
data/test/data/w9.pdf ADDED
Binary file
Binary file
@@ -0,0 +1,143 @@
1
+ require 'carrierwave'
2
+ require 'fileutils'
3
+
# Directory containing this test file; fixtures and uploads are resolved against it.
ROOT = File.dirname(__FILE__)

# Put the gem's lib/ directory (a sibling of this directory) first on the load
# path. The base directory is the second argument, so the result is
# "<parent of ROOT>/lib" rather than "../lib/<ROOT>".
$:.unshift File.expand_path("../lib", ROOT)
7
+
8
+ require 'carrierwave-docsplit'
9
+
10
+ require 'minitest/unit'
11
+
12
+ begin; require 'turn/autorun'; rescue LoadError; end
13
+
# Uploader fixture that extracts page images at two sizes and exposes them
# through a `thumbs` reader.
class TestUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  # Path of the sample PDF used by the tests.
  #
  # Returns a String; the caller opens it. (Previously an open File was
  # returned and never closed.)
  def self.file_path
    File.join(ROOT, 'data/w9.pdf')
  end

  # Uploads are written beneath the test directory.
  def store_dir
    File.join(ROOT, 'uploads')
  end

  storage :file

  extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
end
29
+
# Uploader fixture that extracts page images at a single size.
class SingleSizeUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  # The one size requested, as version name => geometry.
  def self.sizes
    { :large => "300x" }
  end

  # Path of the sample PDF used by the tests.
  #
  # Returns a String; the caller opens it. (Previously an open File was
  # returned and never closed.)
  def self.file_path
    File.join(ROOT, 'data/w9_single.pdf')
  end

  # Uploads are written beneath the test directory.
  def store_dir
    File.join(ROOT, 'uploads')
  end

  storage :file

  # The size is merged in next to :to rather than nested under :sizes.
  image_options = { :to => :thumbs }.merge(sizes)

  extract :images => image_options
end
51
+
# Uploader fixture that extracts text into the mounting model's `tail` attribute.
class TextExtractionUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  storage :file

  extract :text => { :to => :tail }

  # Uploads are written beneath the test directory.
  def store_dir
    File.join(ROOT, 'uploads')
  end
end
63
+
# Minimal model stand-in: can mount uploaders and has a `tail` attribute for
# extracted text to be written to.
class Pig
  attr_accessor :tail
  extend CarrierWave::Mount
end
68
+
69
+ TEST_OUTPUT_PATH = File.join ROOT, 'uploads/w9'
70
+ SINGLE_OUTPUT_PATH = File.join ROOT, 'uploads/w9_single'
71
+
# Tests for the image and text extraction hooks, run against the uploader
# fixtures defined earlier in this file.
class TestCarrierWaveDocsplit < MiniTest::Unit::TestCase
  # Mixed in so that NoModelError can be referenced unqualified below.
  include CarrierWave::DocsplitIntegration

  def store_dir
    File.join(ROOT, 'uploads')
  end

  def setup
    # Configure the root before the uploader is first used.
    CarrierWave.configure do |config|
      config.root = ROOT
    end

    @uploader = TestUploader.new
    @uploader.retrieve_from_store! File.join(ROOT, 'w9.pdf')
  end

  # True when output_path exists and contains at least one entry.
  # The uploader argument is currently unused.
  def extracted_images_exist?(uploader, output_path)
    File.exist?(output_path) && Dir.glob(File.join(output_path, "*")).any?
  end

  def test_that_read_accessor_is_being_generated
    assert @uploader.respond_to?(:thumbs)
  end

  def test_that_reader_returns_valid_hash
    if extracted_images_exist? @uploader, TEST_OUTPUT_PATH
      @uploader.retrieve_from_store!('w9.pdf')
    else
      # Block form closes the handle once the file has been stored.
      File.open(TestUploader.file_path) { |file| @uploader.store! file }
    end

    thumbs = @uploader.thumbs

    assert thumbs.include?('300x'), "Thumbs does not include 300x"
    assert thumbs.include?('500x'), "Thumbs does not include 500x"

    assert thumbs.values.all? { |val| val.is_a? Array }
  end

  def test_that_output_path_returns_nil_if_no_file_stored
    uploader = TestUploader.new
    assert_nil uploader.output_path
  end

  def test_should_handle_one_size_gracefully
    uploader = SingleSizeUploader.new

    if extracted_images_exist? uploader, SINGLE_OUTPUT_PATH
      uploader.retrieve_from_store! 'w9_single.pdf'
    else
      # Store through `uploader`; the original called an undefined `uploads`.
      File.open(SingleSizeUploader.file_path) { |file| uploader.store! file }
    end

    assert uploader.thumbs.include?(SingleSizeUploader.sizes.values.first)
  end

  def test_that_text_extraction_should_raise_error_if_no_model
    assert_raises NoModelError do
      uploader = TextExtractionUploader.new
      uploader.enact_text_extraction
    end
  end

  def test_that_text_is_assigned_to_chosen_attribute
    Pig.mount_uploader :description, TextExtractionUploader
    pig = Pig.new
    File.open(File.join(ROOT, 'data/w9.pdf')) { |pdf| pig.description = pdf }

    # assert_equal takes the expected value first.
    assert_equal File.read(File.join(ROOT, 'uploads/w9/w9.txt')), pig.tail
  end
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: carrierwave-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Justin Woodbridge
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: turn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: docsplit
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: carrierwave
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Bring together docsplit and carrierwave in a loving union.
56
+ email:
57
+ - jwoodbridge@me.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - Gemfile
64
+ - Gemfile.lock
65
+ - README.md
66
+ - Rakefile
67
+ - carrierwave-docsplit.gemspec
68
+ - lib/carrierwave-docsplit.rb
69
+ - lib/carrierwave-docsplit/version.rb
70
+ - test/data/w9.pdf
71
+ - test/data/w9_single.pdf
72
+ - test/test_uploader.rb
73
+ homepage: ''
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project: carrierwave-docsplit
92
+ rubygems_version: 2.0.0
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Bring together docsplit and carrierwave in a loving union.
96
+ test_files:
97
+ - test/data/w9.pdf
98
+ - test/data/w9_single.pdf
99
+ - test/test_uploader.rb