carrierwave-docsplit 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +41 -0
- data/README.md +17 -0
- data/Rakefile +12 -0
- data/carrierwave-docsplit.gemspec +26 -0
- data/lib/carrierwave-docsplit.rb +127 -0
- data/lib/carrierwave-docsplit/version.rb +5 -0
- data/test/data/w9.pdf +0 -0
- data/test/data/w9_single.pdf +0 -0
- data/test/test_uploader.rb +143 -0
- metadata +99 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 130d69ad01b1df6ed26e49629fdba48deb463cc0
|
4
|
+
data.tar.gz: f288596517c59c64764c5a4a9b64734c75ba326c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f78aa024c446ade564da2c3b18b46025bb5d2009e178142cdccee0ce5d68cecb7f66acacd25a93d79d4ee93ea620ccf7abe2ef37a34634c7b0dbfbe97ed02e3e
|
7
|
+
data.tar.gz: 8c5e6679a90e2da7e8658988bdf79d02cff814c51f65ba03f14125aad8ca153fc19765f051ff2f2fdd4672c3976bef1d89e71b924efce08f271b5d5e6a97207a
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
test/uploads
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
carrierwave-docsplit (0.0.1)
|
5
|
+
carrierwave
|
6
|
+
docsplit
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://rubygems.org/
|
10
|
+
specs:
|
11
|
+
activemodel (3.2.2)
|
12
|
+
activesupport (= 3.2.2)
|
13
|
+
builder (~> 3.0.0)
|
14
|
+
activerecord (3.2.2)
|
15
|
+
activemodel (= 3.2.2)
|
16
|
+
activesupport (= 3.2.2)
|
17
|
+
arel (~> 3.0.2)
|
18
|
+
tzinfo (~> 0.3.29)
|
19
|
+
activesupport (3.2.2)
|
20
|
+
i18n (~> 0.6)
|
21
|
+
multi_json (~> 1.0)
|
22
|
+
ansi (1.4.2)
|
23
|
+
arel (3.0.2)
|
24
|
+
builder (3.0.0)
|
25
|
+
carrierwave (0.6.0)
|
26
|
+
activemodel (>= 3.2.0)
|
27
|
+
activesupport (>= 3.2.0)
|
28
|
+
docsplit (0.6.3)
|
29
|
+
i18n (0.6.0)
|
30
|
+
multi_json (1.2.0)
|
31
|
+
turn (0.9.4)
|
32
|
+
ansi
|
33
|
+
tzinfo (0.3.32)
|
34
|
+
|
35
|
+
PLATFORMS
|
36
|
+
ruby
|
37
|
+
|
38
|
+
DEPENDENCIES
|
39
|
+
activerecord
|
40
|
+
carrierwave-docsplit!
|
41
|
+
turn
|
data/README.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# CarrierWave + Docsplit: A Loving Union
|
2
|
+
|
3
|
+
carrierwave-docsplit is a thin wrapper around docsplit that knows how to talk to carrierwave.
|
4
|
+
|
5
|
+
# Usage
|
6
|
+
|
7
|
+
1. Require the gem and extend your uploader class with the integration module.
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
extend CarrierWave::DocsplitIntegration
|
11
|
+
```
|
12
|
+
|
13
|
+
2. Hook in the integration.
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
|
17
|
+
```
|
data/Rakefile
ADDED
data/carrierwave-docsplit.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "carrierwave-docsplit/version"
|
4
|
+
|
5
|
+
# Gem specification for carrierwave-docsplit.
#
# The file lists are taken from `git ls-files`, so the gem must be built from
# inside a git checkout.
Gem::Specification.new do |spec|
  tagline = %q{Bring together docsplit and carrierwave in a loving union.}

  # Identity
  spec.name        = "carrierwave-docsplit"
  spec.version     = Carrierwave::Docsplit::VERSION
  spec.authors     = ["Justin Woodbridge"]
  spec.email       = ["jwoodbridge@me.com"]
  spec.homepage    = ""
  spec.summary     = tagline
  spec.description = tagline

  spec.rubyforge_project = "carrierwave-docsplit"

  # Packaged files, as tracked by git
  tracked_files = `git ls-files`
  tracked_tests = `git ls-files -- {test,spec,features}/*`
  tracked_bins  = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  spec.executables   = tracked_bins.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_development_dependency "turn"

  spec.add_runtime_dependency "docsplit"
  spec.add_runtime_dependency "carrierwave"
end
|
data/lib/carrierwave-docsplit.rb
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
require "carrierwave-docsplit/version"
|
2
|
+
|
3
|
+
require 'carrierwave'
|
4
|
+
require 'docsplit'
|
5
|
+
require 'json'
|
6
|
+
require 'pathname'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
module CarrierWave
  # Class-level macros that hook Docsplit extraction into an uploader's
  # processing queue.
  #
  # Extend an uploader class with this module and call +extract+:
  #
  #   extend CarrierWave::DocsplitIntegration
  #   extract :images => { :to => :thumbs, :sizes => { :large => "300x" } },
  #           :text   => { :to => :body }
  #
  # The host class must provide a class-level +process+ macro and instance
  # methods +file+ and +store_dir+.
  module DocsplitIntegration

    # Raised by the text extraction step when the uploader has no model to
    # receive the extracted text. Derives from StandardError so that a plain
    # `rescue` (and `rescue => e`) catches it.
    class NoModelError < StandardError
      def initialize(msg = "Text extraction requires a model for the uploader to write results to.")
        super
      end
    end

    # Set up the extraction.
    #
    # options - Hash with either or both of:
    #           :images - options for setup_image_extraction
    #           :text   - options for setup_text_extraction
    def extract(options = {})
      setup_image_extraction(options[:images]) if options[:images]
      setup_text_extraction(options[:text]) if options[:text]
    end

    # Registers an +enact_text_extraction+ processing step on the uploader.
    #
    # options - Hash; :to names the model attribute that receives the text.
    #
    # The generated step raises NoModelError when the uploader's @model is
    # nil; otherwise it runs Docsplit text extraction (OCR disabled) into
    # "<store_dir>/<file basename>" and assigns the contents of the first
    # *.txt file found there to the model attribute.
    def setup_text_extraction(options)
      attribute = options[:to]

      process :enact_text_extraction

      define_method :enact_text_extraction do
        raise NoModelError if @model.nil?

        out = File.join(store_dir, file.basename)
        FileUtils.mkdir_p out
        Docsplit.extract_text file.path, :ocr => false, :output => out
        text = File.read(Dir.glob(File.join(out, '*.txt')).first)

        @model.send "#{attribute}=", text
      end
    end

    # Registers image extraction on the uploader and defines three instance
    # methods: +output_path+, +enact_extraction+ and a reader named after
    # options[:to].
    #
    # options - Hash with:
    #           :to    - name of the reader method to define
    #           :sizes - Hash of version name => geometry string, e.g.
    #                    { :large => "300x" }. For convenience the sizes may
    #                    also sit directly beside :to
    #                    ({ :to => :thumbs, :large => "300x" }).
    #
    # The caller's options hash is never modified.
    def setup_image_extraction(options)
      reader = options[:to]
      sizes  = options[:sizes] || options.reject { |key, _| key == :to }

      # Directory that holds the extracted images, or nil without a file.
      define_method :output_path do
        file.nil? ? nil : File.join(store_dir, file.basename)
      end

      # Latch our extraction method into the processing queue.
      process :enact_extraction => sizes

      # Define a reader to access the thumbnails stored on disk.
      #
      # Returns a hash structured like so.
      #
      #   {
      #     '700x700' => ['/uploads/w9/700x/w9_1.png']
      #   }
      #
      # This allows for accessing the sizes like this:
      #
      #   file.thumbs['700x700']
      #
      # Returns an empty hash when no file is attached.
      define_method reader do
        dir = output_path
        return {} if dir.nil?

        entries  = Dir.glob(File.join(dir, '*'))
        sub_dirs = entries.select { |entry| File.directory?(entry) }

        # Multiple sizes: the output holds one sub-directory per size. Only
        # the entry's own name is inspected, so digits in a parent directory
        # or in an image file name cannot trigger this branch by accident.
        if sub_dirs.any? { |entry| File.basename(entry) =~ /\d+x\d*/ }
          sub_dirs.each_with_object({}) do |size_dir, found|
            found[File.basename(size_dir)] = Dir.glob(File.join(size_dir, '*'))
          end

        # Only a single size supplied: images sit directly in the output dir.
        else
          { sizes.values.first => entries }
        end
      end

      # Processing step. Receives the sizes hash flattened into
      # [name, geometry, name, geometry, ...].
      #
      # Raises ArgumentError when the list is empty or has an odd length.
      define_method :enact_extraction do |*args|
        args.flatten!

        if args.empty? || args.size.odd?
          raise ArgumentError, "Need an even amount of arguments (Given #{args.size} #{args.size % 2})"
        end

        # Zip the args back into the hash the user wrote them in.
        requested = Hash[*args]

        self.class.class_eval do
          requested.keys.each { |size| version size }
        end

        out = output_path
        FileUtils.mkdir_p out

        Docsplit.extract_images file.path, :size => requested.values, :output => out
      end
    end
  end
end
|
data/test/data/w9.pdf
ADDED
Binary file
|
Binary file
|
data/test/test_uploader.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'carrierwave'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
# Directory containing this test file; fixtures and uploads live beneath it.
ROOT = File.dirname(__FILE__)

# Put the gem's lib directory (a sibling of this directory) on the load path.
# File.expand_path resolves '../lib' relative to ROOT; joining the two pieces
# in the opposite order produced a path that does not exist.
$:.unshift File.expand_path('../lib', ROOT)
|
7
|
+
|
8
|
+
require 'carrierwave-docsplit'
|
9
|
+
|
10
|
+
require 'minitest/unit'
|
11
|
+
|
12
|
+
begin; require 'turn/autorun'; rescue LoadError; end
|
13
|
+
|
14
|
+
# Uploader fixture that extracts page images at two sizes and exposes them
# through a +thumbs+ reader.
class TestUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  # All fixtures store beneath test/uploads.
  def store_dir
    File.join(ROOT, 'uploads')
  end

  # Path of the PDF fixture used with this uploader.
  #
  # Returns a String path rather than an open File, so no handle is leaked;
  # callers that pass the result to File.open keep working.
  def self.file_path
    File.join(ROOT, 'data/w9.pdf')
  end

  storage :file

  extract :images => { :to => :thumbs, :sizes => { :large => "300x", :medium => "500x" } }
end
|
29
|
+
|
30
|
+
# Uploader fixture that requests a single image size, passed in the flat
# form where the size sits directly beside :to.
class SingleSizeUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  # All fixtures store beneath test/uploads.
  def store_dir
    File.join(ROOT, 'uploads')
  end

  storage :file

  # The one size this uploader extracts (version name => geometry).
  def self.sizes
    { :large => "300x" }
  end

  # Path of the PDF fixture used with this uploader.
  #
  # Returns a String path rather than an open File, so no handle is leaked;
  # callers that pass the result to File.open keep working.
  def self.file_path
    File.join(ROOT, 'data/w9_single.pdf')
  end

  image_options = {:to => :thumbs }.merge self.sizes

  extract :images => image_options
end
|
51
|
+
|
52
|
+
# Uploader fixture that extracts text and writes it to the mounting model's
# +tail+ attribute.
class TextExtractionUploader < CarrierWave::Uploader::Base
  extend CarrierWave::DocsplitIntegration

  storage :file

  extract(:text => { :to => :tail })

  # All fixtures store beneath test/uploads.
  def store_dir
    File.join ROOT, 'uploads'
  end
end
|
63
|
+
|
64
|
+
# Minimal model stand-in: can mount uploaders and has a +tail+ attribute for
# extracted text to land in.
class Pig
  extend CarrierWave::Mount

  attr_reader :tail
  attr_writer :tail
end
|
68
|
+
|
69
|
+
# Directories beneath ROOT that the tests inspect for extracted images of the
# multi-page and single-page fixtures.
TEST_OUTPUT_PATH   = File.join(ROOT, 'uploads/w9')
SINGLE_OUTPUT_PATH = File.join(ROOT, 'uploads/w9_single')
|
71
|
+
|
72
|
+
# Integration tests for CarrierWave::DocsplitIntegration, run against the PDF
# fixtures in test/data. Extracted output is written beneath test/uploads.
class TestCarrierWaveDocsplit < MiniTest::Unit::TestCase
  # Brings NoModelError into constant lookup scope for the assertions below.
  include CarrierWave::DocsplitIntegration

  def store_dir
    File.join(ROOT, 'uploads')
  end

  def setup
    @uploader = TestUploader.new
    @uploader.retrieve_from_store! File.join(ROOT, 'w9.pdf')

    CarrierWave.configure do |config|
      config.root = ROOT
    end
  end

  # True when output_path exists and contains at least one entry.
  # The uploader argument is unused; it is kept so existing calls still work.
  def extracted_images_exist?(uploader, output_path)
    File.exist?(output_path) && Dir.glob(File.join(output_path, "*")).any?
  end

  def test_that_read_accessor_is_being_generated
    assert @uploader.respond_to? :thumbs
  end

  def test_that_reader_returns_valid_hash
    # Reuse images from an earlier run when present; otherwise extract now.
    if extracted_images_exist? @uploader, TEST_OUTPUT_PATH
      @uploader.retrieve_from_store!('w9.pdf')
    else
      File.open(TestUploader.file_path) { |file| @uploader.store! file }
    end

    thumbs = @uploader.thumbs

    assert thumbs.include?('300x'), "Thumbs does not include 300x"
    assert thumbs.include?('500x'), "Thumbs does not include 500x"

    assert thumbs.values.all? { |val| val.is_a? Array }
  end

  def test_that_output_path_returns_nil_if_no_file_stored
    uploader = TestUploader.new
    assert_nil uploader.output_path
  end

  def test_should_handle_one_size_gracefully
    uploader = SingleSizeUploader.new

    if extracted_images_exist? uploader, SINGLE_OUTPUT_PATH
      uploader.retrieve_from_store! 'w9_single.pdf'
    else
      # Store through the local uploader; the undefined name `uploads` used
      # here previously raised NameError.
      File.open(SingleSizeUploader.file_path) { |file| uploader.store! file }
    end

    assert uploader.thumbs.include?(SingleSizeUploader.sizes.values.first)
  end

  def test_that_text_extraction_should_raise_error_if_no_model
    assert_raises NoModelError do
      uploader = TextExtractionUploader.new
      uploader.enact_text_extraction
    end
  end

  def test_that_text_is_assigned_to_chosen_attribute
    Pig.mount_uploader :description, TextExtractionUploader
    pig = Pig.new
    File.open(File.join(ROOT, 'data/w9.pdf')) { |pdf| pig.description = pdf }

    expected = File.read(File.join(ROOT, 'uploads/w9/w9.txt'))
    assert_equal expected, pig.tail
  end
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: carrierwave-docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Justin Woodbridge
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: turn
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: docsplit
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: carrierwave
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Bring together docsplit and carrierwave in a loving union.
|
56
|
+
email:
|
57
|
+
- jwoodbridge@me.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- .gitignore
|
63
|
+
- Gemfile
|
64
|
+
- Gemfile.lock
|
65
|
+
- README.md
|
66
|
+
- Rakefile
|
67
|
+
- carrierwave-docsplit.gemspec
|
68
|
+
- lib/carrierwave-docsplit.rb
|
69
|
+
- lib/carrierwave-docsplit/version.rb
|
70
|
+
- test/data/w9.pdf
|
71
|
+
- test/data/w9_single.pdf
|
72
|
+
- test/test_uploader.rb
|
73
|
+
homepage: ''
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - '>='
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project: carrierwave-docsplit
|
92
|
+
rubygems_version: 2.0.0
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Bring together docsplit and carrierwave in a loving union.
|
96
|
+
test_files:
|
97
|
+
- test/data/w9.pdf
|
98
|
+
- test/data/w9_single.pdf
|
99
|
+
- test/test_uploader.rb
|