grim 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.textile +17 -0
- data/Rakefile +1 -0
- data/grim.gemspec +20 -0
- data/lib/grim.rb +104 -0
- data/lib/pdf_info.ps +21 -0
- data/spec/fixtures/smoker.pdf +0 -0
- data/spec/lib/grim_spec.rb +95 -0
- data/spec/spec_helper.rb +21 -0
- metadata +79 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.textile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
h1. Grim
|
2
|
+
|
3
|
+
Grim is a simple gem for extracting a page from a PDF and converting it to an image, as well as extracting the text from the page as a string. It gives you an easy-to-use API to ghostscript, imagemagick, and pdftotext specific to this use case.
|
4
|
+
|
5
|
+
h2. Prerequisites
|
6
|
+
|
7
|
+
You will need ghostscript, imagemagick, and xpdf installed. On the Mac (OS X) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed; it's as simple as "brew install ghostscript", "brew install imagemagick", and "brew install xpdf".
|
8
|
+
|
9
|
+
h2. Usage
|
10
|
+
|
11
|
+
<pre><code>
|
12
|
+
instance = Grim.new("/path/to/pdf")
|
13
|
+
page_count = instance.page_count # returns the number of pages in the pdf
|
14
|
+
png = instance.page(1).to_image("/path/to/save/image.png") # saves png to path and returns File instance
|
15
|
+
jpeg = instance.page(2).to_image("/path/to/save/image.jpeg") # saves jpeg to path and returns File instance
|
16
|
+
text = instance.page(3).text # returns text as a string
|
17
|
+
</code></pre>
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/grim.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "grim"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "grim"
|
7
|
+
s.version = Grim::VERSION
|
8
|
+
s.authors = ["Jonathan Hoyt"]
|
9
|
+
s.email = ["jonmagic@gmail.com"]
|
10
|
+
s.homepage = ""
|
11
|
+
s.summary = %q{Extract slides and text from a PDF.}
|
12
|
+
s.description = %q{Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.}
|
13
|
+
|
14
|
+
s.rubyforge_project = "grim"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
end
|
data/lib/grim.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
# Grim is a class with instance methods for getting number of pages in a pdf,
# extracting a page as an image, and extracting the text from a page.
#
# For example:
#
#   instance   = Grim.new("/path/to/pdf")
#   page_count = instance.page_count
#   png        = instance.page(1).to_image("/path/to/save/image.png")
#   jpeg       = instance.page(2).to_image("/path/to/save/image.jpeg")
#   text       = instance.page(3).text
#
# All external tools (gs, convert, pdftotext) are invoked with an argument
# array rather than an interpolated shell string, so paths containing spaces
# or shell metacharacters are passed through verbatim and never interpreted
# by a shell.
class Grim
  # VERSION
  VERSION = "0.1.0"

  # Default resize output width, any positive integer
  WIDTH = 1024

  # Default image quality, 1 to 100
  QUALITY = 90

  # Default density, any positive integer
  DENSITY = 300

  # Default exception class for Grim.
  # (Named Exception for backwards compatibility; it derives from
  # StandardError, so a bare `rescue` catches it.)
  class Exception < ::StandardError
  end

  # Exception that is raised if pdf is not found.
  class PdfNotFound < Grim::Exception
  end

  # be able to store what page instance should focus on
  attr_accessor :page_number

  # initialize is called when a new instance is created and accepts path.
  #
  # path - String path to the pdf on disk.
  #
  # Raises Grim::PdfNotFound if nothing exists at path.
  def initialize(path)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    raise Grim::PdfNotFound unless File.exist?(path)

    @page_number = 1
    @path = path
  end

  # page_count uses the memoized path and runs ghostscript
  # to read the pdf with the pdf_info.ps script as a filter,
  # returning the number of pages in the pdf as an integer.
  #
  # For example:
  #
  #   instance.page_count
  #   # => 4
  #
  # Returns an integer.
  def page_count
    @page_count ||= begin
      # Locate the script next to this source file so the lookup does not
      # depend on the process's current working directory.
      script = File.expand_path("pdf_info.ps", File.dirname(__FILE__))
      run_command(["gs", "-dNODISPLAY", "-q", "-sFile=#{@path}", script]).to_i
    end
  end

  # page just sets the page attribute on the instance.
  #
  # For example:
  #
  #   instance.page(1)
  #   # => instance
  #
  # Returns self.
  def page(number)
    @page_number = number
    self
  end

  # Returns page_number minus 1
  def index
    @page_number - 1
  end

  # to_image extracts the selected page and turns it into an image.
  # Tested on png and jpeg.
  #
  # For example:
  #
  #   instance.page(2).to_image("/path/to/save/image")
  #   # => File
  #
  # Returns an instance of File (opened for reading, positioned at the start).
  def to_image(path)
    run_command([
      "convert",
      "-resize", Grim::WIDTH.to_s,
      "-antialias",
      "-render",
      "-quality", Grim::QUALITY.to_s,
      "-colorspace", "RGB",
      "-interlace", "none",
      "-density", Grim::DENSITY.to_s,
      "#{@path}[#{index}]", # "[n]" selects the zero-based page to convert
      path
    ])

    File.open(path)
  end

  # text is an instance method that extracts the text from the selected page.
  #
  # For example:
  #
  #   instance.page(2).text
  #   # => "This is text from slide 2.\n\nAnd even more text from slide 2."
  #
  # Returns a string
  def text
    page = @page_number.to_s
    run_command(["pdftotext", "-enc", "UTF-8", "-f", page, "-l", page, @path, "-"])
  end

  private

  # Runs argv (an Array of Strings) directly, without a shell, and returns
  # everything the command wrote to stdout as a String.
  def run_command(argv)
    IO.popen(argv, &:read)
  end
end
|
data/lib/pdf_info.ps
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
% usage: gs -dNODISPLAY -q -sFile=____.pdf pdf_info.ps
|
2
|
+
|
3
|
+
/File where not {
|
4
|
+
(\n *** Missing input file name \(use -sFile=____.pdf\)\n) =
|
5
|
+
( usage: gs -dNODISPLAY -q -sFile=____.pdf [ options ] toolbin/pdf_info.ps\n) =
|
6
|
+
() =
|
7
|
+
flush
|
8
|
+
quit
|
9
|
+
} if
|
10
|
+
pop % discard the dict from where
|
11
|
+
|
12
|
+
/QUIET true def % in case they forgot
|
13
|
+
|
14
|
+
() =
|
15
|
+
File dup (r) file runpdfbegin
|
16
|
+
/PDFPageCount pdfpagecount def
|
17
|
+
PDFPageCount =print () =
|
18
|
+
flush
|
19
|
+
() =
|
20
|
+
|
21
|
+
quit
|
Binary file
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Grim do
|
5
|
+
after(:all) do
|
6
|
+
FileUtils.rm_rf(tmp_dir)
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have a VERSION constant" do
|
10
|
+
Grim.const_defined?('VERSION').should be_true
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "Pdf" do
|
14
|
+
describe "#initialize" do
|
15
|
+
it "should raise an error if pdf does not exist" do
|
16
|
+
lambda { Grim.new(fixture_path("booboo.pdf")) }.should raise_error(Grim::PdfNotFound)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "#page_count" do
|
21
|
+
it "should return an integer" do
|
22
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
23
|
+
instance.page_count.should == 25
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "#page" do
|
28
|
+
it "should be set to 1 by default" do
|
29
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
30
|
+
instance.page_number.should == 1
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should set page attribute and return instance" do
|
34
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
35
|
+
instance.page(2).should == instance
|
36
|
+
instance.page_number.should == 2
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "#index" do
|
41
|
+
it "should return page minus 1" do
|
42
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
43
|
+
instance.page(2)
|
44
|
+
instance.index.should == 1
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "#to_image" do
|
49
|
+
describe "output png" do
|
50
|
+
before(:all) do
|
51
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
52
|
+
@png = instance.to_image(tmp_path("to_png_spec.png"))
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should create the file" do
|
56
|
+
File.exist?(tmp_path("to_png_spec.png")).should be_true
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should return an instance of File" do
|
60
|
+
@png.class.should == File
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should have the right file size" do
|
64
|
+
@png.stat.size.should == 188515
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe "output jpeg" do
|
69
|
+
before(:all) do
|
70
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
71
|
+
@jpeg = instance.to_image(tmp_path("to_jpeg_spec.jpeg"))
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should create the file" do
|
75
|
+
File.exist?(tmp_path("to_jpeg_spec.jpeg")).should be_true
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should return an instance of File" do
|
79
|
+
@jpeg.class.should == File
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should have the right file size" do
|
83
|
+
@jpeg.stat.size.should == 53980
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
describe "#text" do
|
89
|
+
it "should return the text from the selected page" do
|
90
|
+
instance = Grim.new(fixture_path("smoker.pdf"))
|
91
|
+
instance.page(2).text.should == "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
|
4
|
+
require 'grim'
|
5
|
+
|
6
|
+
RSpec.configure do |config|
|
7
|
+
def fixture_path(name)
|
8
|
+
path = File.expand_path("./spec/fixtures/")
|
9
|
+
File.join(path, name)
|
10
|
+
end
|
11
|
+
|
12
|
+
def tmp_dir
|
13
|
+
path = File.expand_path("./tmp")
|
14
|
+
Dir.mkdir(path) unless File.directory?(path)
|
15
|
+
path
|
16
|
+
end
|
17
|
+
|
18
|
+
def tmp_path(name)
|
19
|
+
File.join(tmp_dir, name)
|
20
|
+
end
|
21
|
+
end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: grim
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Jonathan Hoyt
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-09-05 00:00:00 -04:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
|
23
|
+
email:
|
24
|
+
- jonmagic@gmail.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- README.textile
|
35
|
+
- Rakefile
|
36
|
+
- grim.gemspec
|
37
|
+
- lib/grim.rb
|
38
|
+
- lib/pdf_info.ps
|
39
|
+
- spec/fixtures/smoker.pdf
|
40
|
+
- spec/lib/grim_spec.rb
|
41
|
+
- spec/spec_helper.rb
|
42
|
+
has_rdoc: true
|
43
|
+
homepage: ""
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: grim
|
72
|
+
rubygems_version: 1.6.2
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Extract slides and text from a PDF.
|
76
|
+
test_files:
|
77
|
+
- spec/fixtures/smoker.pdf
|
78
|
+
- spec/lib/grim_spec.rb
|
79
|
+
- spec/spec_helper.rb
|