carrierwave-docsplit 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 130d69ad01b1df6ed26e49629fdba48deb463cc0
4
+ data.tar.gz: f288596517c59c64764c5a4a9b64734c75ba326c
5
+ SHA512:
6
+ metadata.gz: f78aa024c446ade564da2c3b18b46025bb5d2009e178142cdccee0ce5d68cecb7f66acacd25a93d79d4ee93ea620ccf7abe2ef37a34634c7b0dbfbe97ed02e3e
7
+ data.tar.gz: 8c5e6679a90e2da7e8658988bdf79d02cff814c51f65ba03f14125aad8ca153fc19765f051ff2f2fdd4672c3976bef1d89e71b924efce08f271b5d5e6a97207a
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ test/uploads
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in carrierwave-docsplit.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ carrierwave-docsplit (0.0.1)
5
+ carrierwave
6
+ docsplit
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ activemodel (3.2.2)
12
+ activesupport (= 3.2.2)
13
+ builder (~> 3.0.0)
14
+ activerecord (3.2.2)
15
+ activemodel (= 3.2.2)
16
+ activesupport (= 3.2.2)
17
+ arel (~> 3.0.2)
18
+ tzinfo (~> 0.3.29)
19
+ activesupport (3.2.2)
20
+ i18n (~> 0.6)
21
+ multi_json (~> 1.0)
22
+ ansi (1.4.2)
23
+ arel (3.0.2)
24
+ builder (3.0.0)
25
+ carrierwave (0.6.0)
26
+ activemodel (>= 3.2.0)
27
+ activesupport (>= 3.2.0)
28
+ docsplit (0.6.3)
29
+ i18n (0.6.0)
30
+ multi_json (1.2.0)
31
+ turn (0.9.4)
32
+ ansi
33
+ tzinfo (0.3.32)
34
+
35
+ PLATFORMS
36
+ ruby
37
+
38
+ DEPENDENCIES
39
+ activerecord
40
+ carrierwave-docsplit!
41
+ turn
data/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # CarrierWave + Docsplit: A Loving Union
2
+
3
+ carrierwave-docsplit is a thin wrapper around docsplit that knows how to talk to carrierwave.
4
+
5
+ # Usage
6
+
7
+ 1. Require the gem and extend your uploader class with the integration module.
8
+
9
+ ```ruby
10
+ extend CarrierWave::DocsplitIntegration
11
+ ```
12
+
13
+ 2. Hook in the integration.
14
+
15
+ ```ruby
16
+ extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
17
+ ```
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
+ task :console do
5
+ sh "irb -r ./test/test_uploader"
6
+ end
7
+
8
+ Rake::TestTask.new do |t|
9
+ t.libs << "test"
10
+ t.test_files = FileList['test/test*.rb']
11
+ t.verbose = true
12
+ end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "carrierwave-docsplit/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "carrierwave-docsplit"
7
+ s.version = Carrierwave::Docsplit::VERSION
8
+ s.authors = ["Justin Woodbridge"]
9
+ s.email = ["jwoodbridge@me.com"]
10
+ s.homepage = ""
11
+ s.summary = %q{Bring together docsplit and carrierwave in a loving union.}
12
+ s.description = %q{Bring together docsplit and carrierwave in a loving union.}
13
+
14
+ s.rubyforge_project = "carrierwave-docsplit"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "turn"
23
+
24
+ s.add_runtime_dependency "docsplit"
25
+ s.add_runtime_dependency "carrierwave"
26
+ end
@@ -0,0 +1,127 @@
1
+ require "carrierwave-docsplit/version"
2
+
3
+ require 'carrierwave'
4
+ require 'docsplit'
5
+ require 'json'
6
+ require 'pathname'
7
+ require 'fileutils'
8
+
9
+ module CarrierWave
10
+ module DocsplitIntegration
11
+
12
+ class NoModelError < Exception
13
+ def message
14
+ "Text extraction requires a model for the uploader to write results to."
15
+ end
16
+ end
17
+
18
+ # Set up extraction: pass :images and/or :text option hashes to enable each kind.
19
+ #
20
+ #
21
+
22
+ def extract(options = {})
23
+ if options[:images]
24
+ self.setup_image_extraction options[:images]
25
+ end
26
+
27
+ if options[:text]
28
+ self.setup_text_extraction options[:text]
29
+ end
30
+ end
31
+
32
+ def setup_text_extraction(options)
33
+ self.instance_eval do
34
+ process :enact_text_extraction
35
+
36
+ define_method :enact_text_extraction do
37
+ raise NoModelError if @model.nil?
38
+
39
+ out = File.join self.store_dir, self.file.basename
40
+ FileUtils.mkdir_p out
41
+ Docsplit.extract_text self.file.path, :ocr => false, :output => out
42
+ text = File.read Dir.glob(File.join(out, '*.txt')).first
43
+
44
+ @model.send "#{options[:to]}=", text
45
+ end
46
+ end
47
+ end
48
+
49
+ def setup_image_extraction(options)
50
+ self.instance_eval do
51
+
52
+ define_method :output_path do
53
+ return nil if self.file.nil?
54
+ File.join self.store_dir, self.file.basename
55
+ end
56
+
57
+ # Latch our extraction method into the processing queue.
58
+ process :enact_extraction => options[:sizes]
59
+
60
+
61
+ # Define a reader to access the thumbnails stored on disk.
62
+ #
63
+ # Returns a hash keyed by the size directory name, structured like so:
64
+ #
65
+ # {
66
+ # '700x' => ['/uploads/w9/700x/w9_1.png']
67
+ # }
68
+ #
69
+ # This allows for accessing the sizes like this:
70
+ #
71
+ # file.thumbs['700x']
72
+
73
+ define_method options[:to] do
74
+ path = File.join(self.output_path, '*')
75
+
76
+ dirs_or_files = Dir.glob(path)
77
+ reduced = {}
78
+
79
+ # Multiple Sizes
80
+ if dirs_or_files.any? { |entry| entry.match /\d+x\d*/ }
81
+
82
+ Dir.glob(path) do |dirs|
83
+ if dirs.is_a?(String)
84
+ dirs = [] << dirs
85
+ end
86
+
87
+ dirs.each do |dir|
88
+ thumbs = Dir.glob(File.join(dir, '*'))
89
+ key = File.basename(dir)
90
+ reduced[key] = thumbs
91
+ end
92
+ end
93
+
94
+ # Only a single size supplied
95
+ else
96
+ options.delete :to
97
+ size = options.values.first
98
+ reduced[size] = dirs_or_files
99
+ end
100
+
101
+ reduced
102
+ end
103
+
104
+ define_method :enact_extraction do |*args|
105
+ args.flatten!
106
+
107
+ # zip the args back into the hash the user wrote them in.
108
+ if args.size == 0 || (args.size % 2) > 0
109
+ raise ArgumentError, "Need an even amount of arguments (Given #{args.size} #{args.size % 2})"
110
+ end
111
+
112
+ sizes = args.each_slice(2).reduce({}) { |mem, pair| mem.merge({ pair.first => pair.last }) }
113
+
114
+ self.class.class_eval do
115
+ sizes.keys.each { |size| version size }
116
+ end
117
+
118
+ out = self.output_path
119
+
120
+ FileUtils.mkdir_p out
121
+
122
+ Docsplit.extract_images self.file.path, :size => sizes.values, :output => out
123
+ end
124
+ end
125
+ end
126
+ end
127
+ end
@@ -0,0 +1,5 @@
1
+ module Carrierwave
2
+ module Docsplit
3
+ VERSION = "0.0.1"
4
+ end
5
+ end
data/test/data/w9.pdf ADDED
Binary file
Binary file
@@ -0,0 +1,143 @@
1
+ require 'carrierwave'
2
+ require 'fileutils'
3
+
4
+ ROOT = File.dirname(__FILE__)
5
+
6
+ $:.unshift File.join "../lib", ROOT
7
+
8
+ require 'carrierwave-docsplit'
9
+
10
+ require 'minitest/unit'
11
+
12
+ begin; require 'turn/autorun'; rescue LoadError; end
13
+
14
+ class TestUploader < CarrierWave::Uploader::Base
15
+ extend CarrierWave::DocsplitIntegration
16
+
17
+ def store_dir
18
+ File.join(ROOT, 'uploads')
19
+ end
20
+
21
+ def self.file_path
22
+ file = File.open(File.join(ROOT, 'data/w9.pdf'))
23
+ end
24
+
25
+ storage :file
26
+
27
+ extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
28
+ end
29
+
30
+ class SingleSizeUploader < CarrierWave::Uploader::Base
31
+ extend CarrierWave::DocsplitIntegration
32
+
33
+ def store_dir
34
+ File.join(ROOT, 'uploads')
35
+ end
36
+
37
+ storage :file
38
+
39
+ def self.sizes
40
+ { :large => "300x" }
41
+ end
42
+
43
+ def self.file_path
44
+ file = File.open(File.join(ROOT, 'data/w9_single.pdf'))
45
+ end
46
+
47
+ image_options = {:to => :thumbs }.merge self.sizes
48
+
49
+ extract :images => image_options
50
+ end
51
+
52
+ class TextExtractionUploader < CarrierWave::Uploader::Base
53
+ extend CarrierWave::DocsplitIntegration
54
+
55
+ def store_dir
56
+ File.join(ROOT, 'uploads')
57
+ end
58
+
59
+ storage :file
60
+
61
+ extract :text => { :to => :tail }
62
+ end
63
+
64
+ class Pig
65
+ extend CarrierWave::Mount
66
+ attr_accessor :tail
67
+ end
68
+
69
+ TEST_OUTPUT_PATH = File.join ROOT, 'uploads/w9'
70
+ SINGLE_OUTPUT_PATH = File.join ROOT, 'uploads/w9_single'
71
+
72
+ class TestCarrierWaveDocsplit < MiniTest::Unit::TestCase
73
+ include CarrierWave::DocsplitIntegration
74
+
75
+ def store_dir
76
+ File.join(ROOT, 'uploads')
77
+ end
78
+
79
+ def setup
80
+ @uploader = TestUploader.new
81
+ @uploader.retrieve_from_store! File.join(ROOT, 'w9.pdf')
82
+
83
+ CarrierWave.configure do |config|
84
+ config.root = ROOT
85
+ end
86
+ end
87
+
88
+ def extracted_images_exist?(uploader, output_path)
89
+ File.exist?(output_path) && Dir.glob(File.join(output_path, "*")).any?
90
+ end
91
+
92
+ def test_that_read_accessor_is_being_generated
93
+ assert @uploader.respond_to? :thumbs
94
+ end
95
+
96
+ def test_that_reader_returns_valid_hash
97
+ if extracted_images_exist? @uploader, TEST_OUTPUT_PATH
98
+ @uploader.retrieve_from_store!('w9.pdf')
99
+ else
100
+ file = File.open(TestUploader.file_path)
101
+ @uploader.store! file
102
+ end
103
+
104
+ thumbs = @uploader.thumbs
105
+
106
+ assert thumbs.include?('300x'), "Thumbs does not include 300x"
107
+ assert thumbs.include?('500x'), "Thumbs does not include 500x"
108
+
109
+ assert thumbs.values.all? { |val| val.is_a? Array }
110
+ end
111
+
112
+ def test_that_output_path_returns_nil_if_no_file_stored
113
+ uploader = TestUploader.new
114
+ assert_equal nil, uploader.output_path
115
+ end
116
+
117
+ def test_should_handle_one_size_gracefully
118
+ uploader = SingleSizeUploader.new
119
+
120
+ if extracted_images_exist? uploader, SINGLE_OUTPUT_PATH
121
+ uploader.retrieve_from_store! 'w9_single.pdf'
122
+ else
123
+ file = File.open SingleSizeUploader.file_path
124
+ uploads.store! file
125
+ end
126
+
127
+ assert uploader.thumbs.include?(SingleSizeUploader.sizes.values.first)
128
+ end
129
+
130
+ def test_that_text_extraction_should_raise_error_if_no_model
131
+ assert_raises NoModelError do
132
+ uploader = TextExtractionUploader.new
133
+ uploader.enact_text_extraction
134
+ end
135
+ end
136
+
137
+ def test_that_text_is_assigned_to_chosen_attribute
138
+ Pig.mount_uploader :description, TextExtractionUploader
139
+ pig = Pig.new
140
+ pig.description = File.open(File.join(ROOT, 'data/w9.pdf'))
141
+ assert_equal pig.tail, File.read(File.join(ROOT,'uploads/w9/w9.txt'))
142
+ end
143
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: carrierwave-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Justin Woodbridge
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: turn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: docsplit
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: carrierwave
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Bring together docsplit and carrierwave in a loving union.
56
+ email:
57
+ - jwoodbridge@me.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - Gemfile
64
+ - Gemfile.lock
65
+ - README.md
66
+ - Rakefile
67
+ - carrierwave-docsplit.gemspec
68
+ - lib/carrierwave-docsplit.rb
69
+ - lib/carrierwave-docsplit/version.rb
70
+ - test/data/w9.pdf
71
+ - test/data/w9_single.pdf
72
+ - test/test_uploader.rb
73
+ homepage: ''
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project: carrierwave-docsplit
92
+ rubygems_version: 2.0.0
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Bring together docsplit and carrierwave in a loving union.
96
+ test_files:
97
+ - test/data/w9.pdf
98
+ - test/data/w9_single.pdf
99
+ - test/test_uploader.rb