grim 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +15 -0
- data/grim.gemspec +0 -1
- data/lib/grim.rb +9 -2
- data/lib/grim/image_magick_processor.rb +37 -0
- data/lib/grim/multi_processor.rb +29 -0
- data/lib/grim/page.rb +2 -11
- data/lib/grim/pdf.rb +1 -5
- data/lib/grim/version.rb +1 -1
- data/spec/lib/grim/image_magick_processor_spec.rb +87 -0
- data/spec/lib/grim/multi_processor_spec.rb +42 -0
- data/spec/lib/grim/page_spec.rb +12 -63
- data/spec/lib/grim/pdf_spec.rb +3 -2
- data/spec/lib/grim_spec.rb +5 -1
- metadata +13 -22
data/README.textile
CHANGED
@@ -48,6 +48,21 @@ h2. Usage
|
|
48
48
|
end
|
49
49
|
</pre></code>
|
50
50
|
|
51
|
+
We also support using other processors (the default is whatever version of ImageMagick/Ghostscript is in your path).
|
52
|
+
|
53
|
+
<pre><code>
|
54
|
+
# specifying one processor with specific ImageMagick and GhostScript paths
|
55
|
+
Grim.processor = Grim::ImageMagickProcessor.new({:imagemagick_path => "/path/to/convert", :ghostscript_path => "/path/to/gs"})
|
56
|
+
|
57
|
+
# multiple processors with fallback if first fails, useful if you need multiple versions of convert/gs
|
58
|
+
Grim.processor = Grim::MultiProcessor.new([
|
59
|
+
Grim::ImageMagickProcessor.new({:imagemagick_path => "/path/to/6.7/convert", :ghostscript_path => "/path/to/9.04/gs"}),
|
60
|
+
Grim::ImageMagickProcessor.new({:imagemagick_path => "/path/to/6.6/convert", :ghostscript_path => "/path/to/9.02/gs"})
|
61
|
+
])
|
62
|
+
|
63
|
+
pdf = Grim.reap('/path/to/pdf')
|
64
|
+
</code></pre>
|
65
|
+
|
51
66
|
h2. License
|
52
67
|
|
53
68
|
See LICENSE for details.
|
data/grim.gemspec
CHANGED
@@ -12,7 +12,6 @@ Gem::Specification.new do |s|
|
|
12
12
|
s.description = %q{Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.}
|
13
13
|
|
14
14
|
s.rubyforge_project = "grim"
|
15
|
-
s.add_dependency 'safe_shell', '~> 1.0.0'
|
16
15
|
|
17
16
|
s.files = `git ls-files`.split("\n")
|
18
17
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/lib/grim.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
# encoding: UTF-8
|
2
|
-
require 'safe_shell'
|
2
|
+
require 'shellwords'
|
3
3
|
|
4
4
|
module Grim
|
5
|
+
extend self
|
6
|
+
attr_accessor :processor
|
7
|
+
|
5
8
|
# Default resize output width, any positive integer
|
6
9
|
WIDTH = 1024
|
7
10
|
|
@@ -47,4 +50,8 @@ module Grim
|
|
47
50
|
end
|
48
51
|
|
49
52
|
require 'grim/pdf'
|
50
|
-
require 'grim/page'
|
53
|
+
require 'grim/page'
|
54
|
+
require 'grim/image_magick_processor'
|
55
|
+
require 'grim/multi_processor'
|
56
|
+
|
57
|
+
Grim.processor = Grim::ImageMagickProcessor.new
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Grim
|
2
|
+
class ImageMagickProcessor
|
3
|
+
|
4
|
+
# ghostscript prints out a warning, this regex matches it
|
5
|
+
WarningRegex = /\*\*\*\*.*\n/
|
6
|
+
|
7
|
+
def initialize(options={})
|
8
|
+
@imagemagick_path = options[:imagemagick_path] || 'convert'
|
9
|
+
@ghostscript_path = options[:ghostscript_path]
|
10
|
+
@original_path = ENV['PATH']
|
11
|
+
end
|
12
|
+
|
13
|
+
def count(path)
|
14
|
+
command = ["-dNODISPLAY", "-q",
|
15
|
+
"-sFile=#{Shellwords.shellescape(path)}",
|
16
|
+
File.expand_path('../../../lib/pdf_info.ps', __FILE__)]
|
17
|
+
@ghostscript_path ? command.unshift(@ghostscript_path) : command.unshift('gs')
|
18
|
+
result = `#{command.join(' ')}`
|
19
|
+
result.gsub(WarningRegex, '').to_i
|
20
|
+
end
|
21
|
+
|
22
|
+
def save(pdf, index, path, options)
|
23
|
+
width = options.fetch(:width, Grim::WIDTH)
|
24
|
+
density = options.fetch(:density, Grim::DENSITY)
|
25
|
+
quality = options.fetch(:quality, Grim::QUALITY)
|
26
|
+
command = [@imagemagick_path, "-resize", width.to_s, "-antialias", "-render",
|
27
|
+
"-quality", quality.to_s, "-colorspace", "RGB",
|
28
|
+
"-interlace", "none", "-density", density.to_s,
|
29
|
+
"#{Shellwords.shellescape(pdf.path)}[#{index}]", path]
|
30
|
+
command.unshift("PATH=#{File.dirname(@ghostscript_path)}:#{ENV['PATH']}") if @ghostscript_path
|
31
|
+
|
32
|
+
result = `#{command.join(' ')}`
|
33
|
+
|
34
|
+
$? == 0 || raise(UnprocessablePage, result)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Grim
|
2
|
+
class MultiProcessor
|
3
|
+
def initialize(processors)
|
4
|
+
@processors = processors
|
5
|
+
end
|
6
|
+
|
7
|
+
def count(path)
|
8
|
+
result = ""
|
9
|
+
@processors.each do |processor|
|
10
|
+
result = processor.count(path)
|
11
|
+
break if result != ""
|
12
|
+
end
|
13
|
+
result
|
14
|
+
end
|
15
|
+
|
16
|
+
def save(pdf, index, path, options)
|
17
|
+
result = true
|
18
|
+
@processors.each do |processor|
|
19
|
+
begin
|
20
|
+
result = processor.save(pdf, index, path, options)
|
21
|
+
rescue UnprocessablePage
|
22
|
+
next
|
23
|
+
end
|
24
|
+
break if result
|
25
|
+
end
|
26
|
+
raise UnprocessablePage unless result
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/grim/page.rb
CHANGED
@@ -32,16 +32,7 @@ module Grim
|
|
32
32
|
def save(path, options={})
|
33
33
|
raise PathMissing if path.nil? || path !~ /\S/
|
34
34
|
|
35
|
-
|
36
|
-
density = options.fetch(:density, Grim::DENSITY)
|
37
|
-
quality = options.fetch(:quality, Grim::QUALITY)
|
38
|
-
|
39
|
-
output = SafeShell.execute("convert", "-resize", width, "-antialias", "-render",
|
40
|
-
"-quality", quality, "-colorspace", "RGB",
|
41
|
-
"-interlace", "none", "-density", density,
|
42
|
-
"#{@pdf.path}[#{@index}]", path)
|
43
|
-
|
44
|
-
$? == 0 || raise(UnprocessablePage, output)
|
35
|
+
Grim.processor.save(@pdf, @index, path, options)
|
45
36
|
end
|
46
37
|
|
47
38
|
# Extracts the text from the selected page.
|
@@ -54,7 +45,7 @@ module Grim
|
|
54
45
|
# Returns a String.
|
55
46
|
#
|
56
47
|
def text
|
57
|
-
|
48
|
+
`#{["pdftotext", "-enc", "UTF-8", "-f", @number, "-l", @number, Shellwords.escape(@pdf.path), "-"].join(' ')}`
|
58
49
|
end
|
59
50
|
end
|
60
51
|
end
|
data/lib/grim/pdf.rb
CHANGED
@@ -5,9 +5,6 @@ module Grim
|
|
5
5
|
|
6
6
|
attr_reader :path
|
7
7
|
|
8
|
-
# ghostscript prints out a warning, this regex matches it
|
9
|
-
WarningRegex = /\*\*\*\*.*\n/
|
10
|
-
|
11
8
|
# Raises an error if pdf not found and sets some instance
|
12
9
|
# variables if pdf is found.
|
13
10
|
#
|
@@ -30,8 +27,7 @@ module Grim
|
|
30
27
|
#
|
31
28
|
def count
|
32
29
|
@count ||= begin
|
33
|
-
|
34
|
-
result.gsub(WarningRegex, '').to_i
|
30
|
+
Grim.processor.count(@path)
|
35
31
|
end
|
36
32
|
end
|
37
33
|
|
data/lib/grim/version.rb
CHANGED
@@ -0,0 +1,87 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Grim::ImageMagickProcessor do
|
5
|
+
before(:each) do
|
6
|
+
@reset_to = ENV['PATH']
|
7
|
+
end
|
8
|
+
|
9
|
+
after(:each) do
|
10
|
+
ENV['PATH'] = @reset_to
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#count" do
|
14
|
+
before(:each) do
|
15
|
+
@processor = Grim::ImageMagickProcessor.new
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should return page count" do
|
19
|
+
@processor.count(fixture_path("smoker.pdf")).should == 25
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#save" do
|
24
|
+
before(:all) do
|
25
|
+
@path = tmp_path("to_png_spec.png")
|
26
|
+
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
27
|
+
|
28
|
+
@processor = Grim::ImageMagickProcessor.new
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should create the file" do
|
32
|
+
@processor.save(@pdf, 0, @path, {})
|
33
|
+
File.exist?(@path).should be_true
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should use default width of 1024" do
|
37
|
+
@processor.save(@pdf, 0, @path, {})
|
38
|
+
width, height = dimensions_for_path(@path)
|
39
|
+
width.should == 1024
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "#save with width option" do
|
44
|
+
before(:each) do
|
45
|
+
@path = tmp_path("to_png_spec.png")
|
46
|
+
pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
47
|
+
|
48
|
+
Grim::ImageMagickProcessor.new.save(pdf, 0, @path, {:width => 20})
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should set width" do
|
52
|
+
width, height = dimensions_for_path(@path)
|
53
|
+
width.should == 20
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe "#save with quality option" do
|
58
|
+
before(:each) do
|
59
|
+
@path = tmp_path("to_png_spec.jpg")
|
60
|
+
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should use quality" do
|
64
|
+
Grim::ImageMagickProcessor.new.save(@pdf, 0, @path, {:quality => 20})
|
65
|
+
lower_size = File.size(@path)
|
66
|
+
|
67
|
+
Grim::ImageMagickProcessor.new.save(@pdf, 0, @path, {:quality => 90})
|
68
|
+
higher_size = File.size(@path)
|
69
|
+
|
70
|
+
(lower_size < higher_size).should be_true
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
describe "#save with density option" do
|
75
|
+
before(:each) do
|
76
|
+
@path = tmp_path("to_png_spec.jpg")
|
77
|
+
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should use density" do
|
81
|
+
lower_time = Benchmark.realtime { Grim::ImageMagickProcessor.new.save(@pdf, 0, @path, {:density => 72}) }
|
82
|
+
higher_time = Benchmark.realtime { Grim::ImageMagickProcessor.new.save(@pdf, 0, @path, {:density => 300}) }
|
83
|
+
|
84
|
+
(lower_time < higher_time).should be_true
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Grim::MultiProcessor do
|
5
|
+
before(:each) do
|
6
|
+
@failure = Grim::ImageMagickProcessor.new
|
7
|
+
@success = Grim::ImageMagickProcessor.new
|
8
|
+
@extra = Grim::ImageMagickProcessor.new
|
9
|
+
@processor = Grim::MultiProcessor.new([@failure, @success, @extra])
|
10
|
+
|
11
|
+
@path = fixture_path("smoker.pdf")
|
12
|
+
@pdf = Grim::Pdf.new(@path)
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "#count" do
|
16
|
+
it "should try processors until it succeeds" do
|
17
|
+
@failure.stub(:count){""}
|
18
|
+
@success.should_receive(:count).and_return(30)
|
19
|
+
@extra.should_not_receive(:count)
|
20
|
+
|
21
|
+
@processor.count(@path)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#save" do
|
26
|
+
it "should try processors until it succeeds" do
|
27
|
+
@failure.stub(:save){false}
|
28
|
+
@success.should_receive(:save).and_return(true)
|
29
|
+
@extra.should_not_receive(:save)
|
30
|
+
|
31
|
+
@processor.save(@pdf, 0, @path, {})
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should raise error if all processors fail" do
|
35
|
+
@failure.should_receive(:save).and_return(false)
|
36
|
+
@success.should_receive(:save).and_return(false)
|
37
|
+
@extra.should_receive(:save).and_return(false)
|
38
|
+
|
39
|
+
lambda { @processor.save(@pdf, 0, @path, {}) }.should raise_error(Grim::UnprocessablePage)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
data/spec/lib/grim/page_spec.rb
CHANGED
@@ -14,18 +14,12 @@ describe Grim::Page do
|
|
14
14
|
describe "#save" do
|
15
15
|
before(:all) do
|
16
16
|
@path = tmp_path("to_png_spec.png")
|
17
|
-
pdf
|
18
|
-
|
19
|
-
pdf[0].save(@path)
|
20
|
-
end
|
21
|
-
|
22
|
-
it "should create the file" do
|
23
|
-
File.exist?(@path).should be_true
|
17
|
+
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
24
18
|
end
|
25
19
|
|
26
|
-
it "should
|
27
|
-
|
28
|
-
|
20
|
+
it "should call Grim.processor.save with pdf, index, path, and options" do
|
21
|
+
Grim.processor.should_receive(:save).with(@pdf, 0, @path, {})
|
22
|
+
@pdf[0].save(@path)
|
29
23
|
end
|
30
24
|
end
|
31
25
|
|
@@ -41,59 +35,14 @@ describe Grim::Page do
|
|
41
35
|
end
|
42
36
|
end
|
43
37
|
|
44
|
-
describe "#save with
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
it "should set width" do
|
53
|
-
width, height = dimensions_for_path(@path)
|
54
|
-
width.should == 20
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
describe "#save with quality option" do
|
59
|
-
before(:each) do
|
60
|
-
@path = tmp_path("to_png_spec.jpg")
|
61
|
-
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
62
|
-
end
|
63
|
-
|
64
|
-
it "should use quality" do
|
65
|
-
@pdf[0].save(@path, :quality => 20)
|
66
|
-
lower_size = File.size(@path)
|
67
|
-
|
68
|
-
@pdf[0].save(@path, :quality => 90)
|
69
|
-
higher_size = File.size(@path)
|
70
|
-
|
71
|
-
(lower_size < higher_size).should be_true
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
describe "#save with density option" do
|
76
|
-
before(:each) do
|
77
|
-
@path = tmp_path("to_png_spec.jpg")
|
78
|
-
@pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
79
|
-
end
|
80
|
-
|
81
|
-
it "should use density" do
|
82
|
-
lower_time = Benchmark.realtime { @pdf[0].save(@path, :density => 20) }
|
83
|
-
higher_time = Benchmark.realtime { @pdf[0].save(@path, :density => 300) }
|
84
|
-
|
85
|
-
(lower_time < higher_time).should be_true
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
describe "#save with an unprocessable PDF" do
|
90
|
-
let(:path) { tmp_path("unprocessable.jpg") }
|
91
|
-
let(:pdf) { Grim::Pdf.new(fixture_path("unprocessable.pdf")) }
|
92
|
-
|
93
|
-
it "should raise an error" do
|
94
|
-
lambda { pdf[0].save(path) }.should raise_error(Grim::UnprocessablePage, /missing an image filename/)
|
95
|
-
end
|
96
|
-
end
|
38
|
+
# describe "#save with an unprocessable PDF" do
|
39
|
+
# let(:path) { tmp_path("unprocessable.jpg") }
|
40
|
+
# let(:pdf) { Grim::Pdf.new(fixture_path("unprocessable.pdf")) }
|
41
|
+
#
|
42
|
+
# it "should raise an error" do
|
43
|
+
# lambda { pdf[0].save(path) }.should raise_error(Grim::UnprocessablePage, /missing an image filename/)
|
44
|
+
# end
|
45
|
+
# end
|
97
46
|
|
98
47
|
describe "#text" do
|
99
48
|
it "should return the text from the selected page" do
|
data/spec/lib/grim/pdf_spec.rb
CHANGED
@@ -19,9 +19,10 @@ describe Grim::Pdf do
|
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "#count" do
|
22
|
-
it "should
|
22
|
+
it "should call Grim.processor.count with pdf path" do
|
23
|
+
Grim.processor.should_receive(:count).with(fixture_path("smoker.pdf"))
|
23
24
|
pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
|
24
|
-
pdf.count
|
25
|
+
pdf.count
|
25
26
|
end
|
26
27
|
end
|
27
28
|
|
data/spec/lib/grim_spec.rb
CHANGED
@@ -2,6 +2,10 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe Grim do
|
5
|
+
it "should have a default processor" do
|
6
|
+
Grim.processor.class.should == Grim::ImageMagickProcessor
|
7
|
+
end
|
8
|
+
|
5
9
|
it "should have a VERSION constant" do
|
6
10
|
Grim.const_defined?('VERSION').should be_true
|
7
11
|
end
|
@@ -18,7 +22,7 @@ describe Grim do
|
|
18
22
|
Grim::DENSITY.should == 300
|
19
23
|
end
|
20
24
|
|
21
|
-
describe "#
|
25
|
+
describe "#reap" do
|
22
26
|
it "should return an instance of Grim::Pdf" do
|
23
27
|
Grim.reap(fixture_path("smoker.pdf")).class.should == Grim::Pdf
|
24
28
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grim
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 4
|
10
|
-
version: 0.2.4
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jonathan Hoyt
|
@@ -15,24 +15,9 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
dependencies:
|
20
|
-
|
21
|
-
name: safe_shell
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - ~>
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 23
|
29
|
-
segments:
|
30
|
-
- 1
|
31
|
-
- 0
|
32
|
-
- 0
|
33
|
-
version: 1.0.0
|
34
|
-
type: :runtime
|
35
|
-
version_requirements: *id001
|
18
|
+
date: 2011-10-04 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
36
21
|
description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
|
37
22
|
email:
|
38
23
|
- jonmagic@gmail.com
|
@@ -50,12 +35,16 @@ files:
|
|
50
35
|
- Rakefile
|
51
36
|
- grim.gemspec
|
52
37
|
- lib/grim.rb
|
38
|
+
- lib/grim/image_magick_processor.rb
|
39
|
+
- lib/grim/multi_processor.rb
|
53
40
|
- lib/grim/page.rb
|
54
41
|
- lib/grim/pdf.rb
|
55
42
|
- lib/grim/version.rb
|
56
43
|
- lib/pdf_info.ps
|
57
44
|
- spec/fixtures/smoker.pdf
|
58
45
|
- spec/fixtures/unprocessable.pdf
|
46
|
+
- spec/lib/grim/image_magick_processor_spec.rb
|
47
|
+
- spec/lib/grim/multi_processor_spec.rb
|
59
48
|
- spec/lib/grim/page_spec.rb
|
60
49
|
- spec/lib/grim/pdf_spec.rb
|
61
50
|
- spec/lib/grim_spec.rb
|
@@ -96,6 +85,8 @@ summary: Extract slides and text from a PDF.
|
|
96
85
|
test_files:
|
97
86
|
- spec/fixtures/smoker.pdf
|
98
87
|
- spec/fixtures/unprocessable.pdf
|
88
|
+
- spec/lib/grim/image_magick_processor_spec.rb
|
89
|
+
- spec/lib/grim/multi_processor_spec.rb
|
99
90
|
- spec/lib/grim/page_spec.rb
|
100
91
|
- spec/lib/grim/pdf_spec.rb
|
101
92
|
- spec/lib/grim_spec.rb
|