carrierwave-docsplit 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +41 -0
- data/README.md +17 -0
- data/Rakefile +12 -0
- data/carrierwave-docsplit.gemspec +26 -0
- data/lib/carrierwave-docsplit.rb +127 -0
- data/lib/carrierwave-docsplit/version.rb +5 -0
- data/test/data/w9.pdf +0 -0
- data/test/data/w9_single.pdf +0 -0
- data/test/test_uploader.rb +143 -0
- metadata +99 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 130d69ad01b1df6ed26e49629fdba48deb463cc0
|
4
|
+
data.tar.gz: f288596517c59c64764c5a4a9b64734c75ba326c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f78aa024c446ade564da2c3b18b46025bb5d2009e178142cdccee0ce5d68cecb7f66acacd25a93d79d4ee93ea620ccf7abe2ef37a34634c7b0dbfbe97ed02e3e
|
7
|
+
data.tar.gz: 8c5e6679a90e2da7e8658988bdf79d02cff814c51f65ba03f14125aad8ca153fc19765f051ff2f2fdd4672c3976bef1d89e71b924efce08f271b5d5e6a97207a
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
test/uploads
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
carrierwave-docsplit (0.0.1)
|
5
|
+
carrierwave
|
6
|
+
docsplit
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://rubygems.org/
|
10
|
+
specs:
|
11
|
+
activemodel (3.2.2)
|
12
|
+
activesupport (= 3.2.2)
|
13
|
+
builder (~> 3.0.0)
|
14
|
+
activerecord (3.2.2)
|
15
|
+
activemodel (= 3.2.2)
|
16
|
+
activesupport (= 3.2.2)
|
17
|
+
arel (~> 3.0.2)
|
18
|
+
tzinfo (~> 0.3.29)
|
19
|
+
activesupport (3.2.2)
|
20
|
+
i18n (~> 0.6)
|
21
|
+
multi_json (~> 1.0)
|
22
|
+
ansi (1.4.2)
|
23
|
+
arel (3.0.2)
|
24
|
+
builder (3.0.0)
|
25
|
+
carrierwave (0.6.0)
|
26
|
+
activemodel (>= 3.2.0)
|
27
|
+
activesupport (>= 3.2.0)
|
28
|
+
docsplit (0.6.3)
|
29
|
+
i18n (0.6.0)
|
30
|
+
multi_json (1.2.0)
|
31
|
+
turn (0.9.4)
|
32
|
+
ansi
|
33
|
+
tzinfo (0.3.32)
|
34
|
+
|
35
|
+
PLATFORMS
|
36
|
+
ruby
|
37
|
+
|
38
|
+
DEPENDENCIES
|
39
|
+
activerecord
|
40
|
+
carrierwave-docsplit!
|
41
|
+
turn
|
data/README.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# CarrierWave + Docsplit: A Loving Union
|
2
|
+
|
3
|
+
carrierwave-docsplit is a thin wrapper around docsplit that knows how to talk to carrierwave.
|
4
|
+
|
5
|
+
# Usage
|
6
|
+
|
7
|
+
1. Require the gem and extend your uploader class with the integration module.
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
extend CarrierWave::DocsplitIntegration
|
11
|
+
```
|
12
|
+
|
13
|
+
2. Hook in the integration.
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
|
17
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "carrierwave-docsplit/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "carrierwave-docsplit"
|
7
|
+
s.version = Carrierwave::Docsplit::VERSION
|
8
|
+
s.authors = ["Justin Woodbridge"]
|
9
|
+
s.email = ["jwoodbridge@me.com"]
|
10
|
+
s.homepage = ""
|
11
|
+
s.summary = %q{Bring together docsplit and carrierwave in a loving union.}
|
12
|
+
s.description = %q{Bring together docsplit and carrierwave in a loving union.}
|
13
|
+
|
14
|
+
s.rubyforge_project = "carrierwave-docsplit"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
|
21
|
+
# specify any dependencies here; for example:
|
22
|
+
s.add_development_dependency "turn"
|
23
|
+
|
24
|
+
s.add_runtime_dependency "docsplit"
|
25
|
+
s.add_runtime_dependency "carrierwave"
|
26
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require "carrierwave-docsplit/version"
|
2
|
+
|
3
|
+
require 'carrierwave'
|
4
|
+
require 'docsplit'
|
5
|
+
require 'json'
|
6
|
+
require 'pathname'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
module CarrierWave
|
10
|
+
module DocsplitIntegration
|
11
|
+
|
12
|
+
class NoModelError < Exception
|
13
|
+
def message
|
14
|
+
"Text extraction requires a model for the uploader to write results to."
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# Set up the extraction.
|
19
|
+
#
|
20
|
+
#
|
21
|
+
|
22
|
+
def extract(options = {})
|
23
|
+
if options[:images]
|
24
|
+
self.setup_image_extraction options[:images]
|
25
|
+
end
|
26
|
+
|
27
|
+
if options[:text]
|
28
|
+
self.setup_text_extraction options[:text]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def setup_text_extraction(options)
|
33
|
+
self.instance_eval do
|
34
|
+
process :enact_text_extraction
|
35
|
+
|
36
|
+
define_method :enact_text_extraction do
|
37
|
+
raise NoModelError if @model.nil?
|
38
|
+
|
39
|
+
out = File.join self.store_dir, self.file.basename
|
40
|
+
FileUtils.mkdir_p out
|
41
|
+
Docsplit.extract_text self.file.path, :ocr => false, :output => out
|
42
|
+
text = File.read Dir.glob(File.join(out, '*.txt')).first
|
43
|
+
|
44
|
+
@model.send "#{options[:to]}=", text
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def setup_image_extraction(options)
|
50
|
+
self.instance_eval do
|
51
|
+
|
52
|
+
define_method :output_path do
|
53
|
+
return nil if self.file.nil?
|
54
|
+
File.join self.store_dir, self.file.basename
|
55
|
+
end
|
56
|
+
|
57
|
+
# Latch our extraction method into the processing queue.
|
58
|
+
process :enact_extraction => options[:sizes]
|
59
|
+
|
60
|
+
|
61
|
+
# Define a reader to access the thumbnails stored on disk.
|
62
|
+
#
|
63
|
+
# Returns a hash structured like so.
|
64
|
+
#
|
65
|
+
# {
|
66
|
+
# '700x' => ['/uploads/w9/700x/w9_1.png']
|
67
|
+
# }
|
68
|
+
#
|
69
|
+
# This allows for accessing the sizes like this:
|
70
|
+
#
|
71
|
+
# file.thumbs['700x']
|
72
|
+
|
73
|
+
define_method options[:to] do
|
74
|
+
path = File.join(self.output_path, '*')
|
75
|
+
|
76
|
+
dirs_or_files = Dir.glob(path)
|
77
|
+
reduced = {}
|
78
|
+
|
79
|
+
# Multiple Sizes
|
80
|
+
if dirs_or_files.any? { |entry| entry.match /\d+x\d*/ }
|
81
|
+
|
82
|
+
Dir.glob(path) do |dirs|
|
83
|
+
if dirs.is_a?(String)
|
84
|
+
dirs = [] << dirs
|
85
|
+
end
|
86
|
+
|
87
|
+
dirs.each do |dir|
|
88
|
+
thumbs = Dir.glob(File.join(dir, '*'))
|
89
|
+
key = File.basename(dir)
|
90
|
+
reduced[key] = thumbs
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# Only a single size supplied
|
95
|
+
else
|
96
|
+
options.delete :to
|
97
|
+
size = options.values.first
|
98
|
+
reduced[size] = dirs_or_files
|
99
|
+
end
|
100
|
+
|
101
|
+
reduced
|
102
|
+
end
|
103
|
+
|
104
|
+
define_method :enact_extraction do |*args|
|
105
|
+
args.flatten!
|
106
|
+
|
107
|
+
# zip the args back into the hash the user wrote them in.
|
108
|
+
if args.size == 0 || (args.size % 2) > 0
|
109
|
+
raise ArgumentError, "Need an even amount of arguments (Given #{args.size} #{args.size % 2})"
|
110
|
+
end
|
111
|
+
|
112
|
+
sizes = args.each_slice(2).reduce({}) { |mem, pair| mem.merge({ pair.first => pair.last }) }
|
113
|
+
|
114
|
+
self.class.class_eval do
|
115
|
+
sizes.keys.each { |size| version size }
|
116
|
+
end
|
117
|
+
|
118
|
+
out = self.output_path
|
119
|
+
|
120
|
+
FileUtils.mkdir_p out
|
121
|
+
|
122
|
+
Docsplit.extract_images self.file.path, :size => sizes.values, :output => out
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
data/test/data/w9.pdf
ADDED
Binary file
|
Binary file
|
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'carrierwave'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
ROOT = File.dirname(__FILE__)
|
5
|
+
|
6
|
+
$:.unshift File.join "../lib", ROOT
|
7
|
+
|
8
|
+
require 'carrierwave-docsplit'
|
9
|
+
|
10
|
+
require 'minitest/unit'
|
11
|
+
|
12
|
+
begin; require 'turn/autorun'; rescue LoadError; end
|
13
|
+
|
14
|
+
class TestUploader < CarrierWave::Uploader::Base
|
15
|
+
extend CarrierWave::DocsplitIntegration
|
16
|
+
|
17
|
+
def store_dir
|
18
|
+
File.join(ROOT, 'uploads')
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.file_path
|
22
|
+
file = File.open(File.join(ROOT, 'data/w9.pdf'))
|
23
|
+
end
|
24
|
+
|
25
|
+
storage :file
|
26
|
+
|
27
|
+
extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
|
28
|
+
end
|
29
|
+
|
30
|
+
class SingleSizeUploader < CarrierWave::Uploader::Base
|
31
|
+
extend CarrierWave::DocsplitIntegration
|
32
|
+
|
33
|
+
def store_dir
|
34
|
+
File.join(ROOT, 'uploads')
|
35
|
+
end
|
36
|
+
|
37
|
+
storage :file
|
38
|
+
|
39
|
+
def self.sizes
|
40
|
+
{ :large => "300x" }
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.file_path
|
44
|
+
file = File.open(File.join(ROOT, 'data/w9_single.pdf'))
|
45
|
+
end
|
46
|
+
|
47
|
+
image_options = {:to => :thumbs }.merge self.sizes
|
48
|
+
|
49
|
+
extract :images => image_options
|
50
|
+
end
|
51
|
+
|
52
|
+
class TextExtractionUploader < CarrierWave::Uploader::Base
|
53
|
+
extend CarrierWave::DocsplitIntegration
|
54
|
+
|
55
|
+
def store_dir
|
56
|
+
File.join(ROOT, 'uploads')
|
57
|
+
end
|
58
|
+
|
59
|
+
storage :file
|
60
|
+
|
61
|
+
extract :text => { :to => :tail }
|
62
|
+
end
|
63
|
+
|
64
|
+
class Pig
|
65
|
+
extend CarrierWave::Mount
|
66
|
+
attr_accessor :tail
|
67
|
+
end
|
68
|
+
|
69
|
+
TEST_OUTPUT_PATH = File.join ROOT, 'uploads/w9'
|
70
|
+
SINGLE_OUTPUT_PATH = File.join ROOT, 'uploads/w9_single'
|
71
|
+
|
72
|
+
class TestCarrierWaveDocsplit < MiniTest::Unit::TestCase
|
73
|
+
include CarrierWave::DocsplitIntegration
|
74
|
+
|
75
|
+
def store_dir
|
76
|
+
File.join(ROOT, 'uploads')
|
77
|
+
end
|
78
|
+
|
79
|
+
def setup
|
80
|
+
@uploader = TestUploader.new
|
81
|
+
@uploader.retrieve_from_store! File.join(ROOT, 'w9.pdf')
|
82
|
+
|
83
|
+
CarrierWave.configure do |config|
|
84
|
+
config.root = ROOT
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def extracted_images_exist?(uploader, output_path)
|
89
|
+
File.exist?(output_path) && Dir.glob(File.join(output_path, "*")).any?
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_that_read_accessor_is_being_generated
|
93
|
+
assert @uploader.respond_to? :thumbs
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_that_reader_returns_valid_hash
|
97
|
+
if extracted_images_exist? @uploader, TEST_OUTPUT_PATH
|
98
|
+
@uploader.retrieve_from_store!('w9.pdf')
|
99
|
+
else
|
100
|
+
file = File.open(TestUploader.file_path)
|
101
|
+
@uploader.store! file
|
102
|
+
end
|
103
|
+
|
104
|
+
thumbs = @uploader.thumbs
|
105
|
+
|
106
|
+
assert thumbs.include?('300x'), "Thumbs does not include 300x"
|
107
|
+
assert thumbs.include?('500x'), "Thumbs does not include 500x"
|
108
|
+
|
109
|
+
assert thumbs.values.all? { |val| val.is_a? Array }
|
110
|
+
end
|
111
|
+
|
112
|
+
def test_that_output_path_returns_nil_if_no_file_stored
|
113
|
+
uploader = TestUploader.new
|
114
|
+
assert_equal nil, uploader.output_path
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_should_handle_one_size_gracefully
|
118
|
+
uploader = SingleSizeUploader.new
|
119
|
+
|
120
|
+
if extracted_images_exist? uploader, SINGLE_OUTPUT_PATH
|
121
|
+
uploader.retrieve_from_store! 'w9_single.pdf'
|
122
|
+
else
|
123
|
+
file = File.open SingleSizeUploader.file_path
|
124
|
+
uploads.store! file
|
125
|
+
end
|
126
|
+
|
127
|
+
assert uploader.thumbs.include?(SingleSizeUploader.sizes.values.first)
|
128
|
+
end
|
129
|
+
|
130
|
+
def test_that_text_extraction_should_raise_error_if_no_model
|
131
|
+
assert_raises NoModelError do
|
132
|
+
uploader = TextExtractionUploader.new
|
133
|
+
uploader.enact_text_extraction
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_that_text_is_assigned_to_chosen_attribute
|
138
|
+
Pig.mount_uploader :description, TextExtractionUploader
|
139
|
+
pig = Pig.new
|
140
|
+
pig.description = File.open(File.join(ROOT, 'data/w9.pdf'))
|
141
|
+
assert_equal pig.tail, File.read(File.join(ROOT,'uploads/w9/w9.txt'))
|
142
|
+
end
|
143
|
+
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: carrierwave-docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Justin Woodbridge
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: turn
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: docsplit
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: carrierwave
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Bring together docsplit and carrierwave in a loving union.
|
56
|
+
email:
|
57
|
+
- jwoodbridge@me.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- .gitignore
|
63
|
+
- Gemfile
|
64
|
+
- Gemfile.lock
|
65
|
+
- README.md
|
66
|
+
- Rakefile
|
67
|
+
- carrierwave-docsplit.gemspec
|
68
|
+
- lib/carrierwave-docsplit.rb
|
69
|
+
- lib/carrierwave-docsplit/version.rb
|
70
|
+
- test/data/w9.pdf
|
71
|
+
- test/data/w9_single.pdf
|
72
|
+
- test/test_uploader.rb
|
73
|
+
homepage: ''
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - '>='
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project: carrierwave-docsplit
|
92
|
+
rubygems_version: 2.0.0
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Bring together docsplit and carrierwave in a loving union.
|
96
|
+
test_files:
|
97
|
+
- test/data/w9.pdf
|
98
|
+
- test/data/w9_single.pdf
|
99
|
+
- test/test_uploader.rb
|