grim 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.textile +17 -0
- data/Rakefile +1 -0
- data/grim.gemspec +20 -0
- data/lib/grim.rb +104 -0
- data/lib/pdf_info.ps +21 -0
- data/spec/fixtures/smoker.pdf +0 -0
- data/spec/lib/grim_spec.rb +95 -0
- data/spec/spec_helper.rb +21 -0
- metadata +79 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.textile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
h1. Grim
|
2
|
+
|
3
|
+
Grim is a simple gem for extracting a page from a PDF and converting it to an image, as well as extracting the text from the page as a string. It basically gives you an easy-to-use API to ghostscript, imagemagick, and pdftotext specific to this use case.
|
4
|
+
|
5
|
+
h2. Prerequisites
|
6
|
+
|
7
|
+
You will need ghostscript, imagemagick, and xpdf installed. On the Mac (OS X) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed; it's as simple as "brew install ghostscript", "brew install imagemagick", and "brew install xpdf".
|
8
|
+
|
9
|
+
h2. Usage
|
10
|
+
|
11
|
+
<pre><code>
|
12
|
+
instance = Grim.new("/path/to/pdf")
|
13
|
+
page_count = instance.page_count # returns the number of pages in the pdf
|
14
|
+
png = instance.page(1).to_image("/path/to/save/image.png") # saves png to path and returns File instance
|
15
|
+
jpeg = instance.page(2).to_image("/path/to/save/image.jpeg") # saves jpeg to path and returns File instance
|
16
|
+
text = instance.page(3).text # returns text as a string
|
17
|
+
</code></pre>
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/grim.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "grim"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "grim"
|
7
|
+
s.version = Grim::VERSION
|
8
|
+
s.authors = ["Jonathan Hoyt"]
|
9
|
+
s.email = ["jonmagic@gmail.com"]
|
10
|
+
s.homepage = ""
|
11
|
+
s.summary = %q{Extract slides and text from a PDF.}
|
12
|
+
s.description = %q{Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.}
|
13
|
+
|
14
|
+
s.rubyforge_project = "grim"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
end
|
data/lib/grim.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
# Grim is a class with instance methods for getting number of pages in a pdf,
|
2
|
+
# extracting a page as an image, and extracting the text from a page.
|
3
|
+
#
|
4
|
+
# For example:
|
5
|
+
#
|
6
|
+
# instance = Grim.new("/path/to/pdf")
|
7
|
+
# page_count = instance.page_count
|
8
|
+
# png = instance.page(1).to_image("/path/to/save/image.png")
|
9
|
+
# jpeg = instance.page(2).to_image("/path/to/save/image.jpeg")
|
10
|
+
# text = instance.page(3).text
|
11
|
+
#
|
12
|
+
class Grim
  # Gem version.
  VERSION = "0.1.0"

  # Default resize output width, any positive integer
  WIDTH = 1024

  # Default image quality, 1 to 100
  QUALITY = 90

  # Default density, any positive integer
  DENSITY = 300

  # Absolute path to the ghostscript helper script that prints the page count.
  # It is resolved relative to this file (not the current working directory)
  # so that page_count works no matter where the calling process was started.
  PDF_INFO_SCRIPT = File.expand_path("../pdf_info.ps", __FILE__)

  # Default exception class for Grim.
  class Exception < ::StandardError
  end

  # Exception that is raised if pdf is not found.
  class PdfNotFound < Grim::Exception
  end

  # The 1-based page number that to_image and text operate on.
  attr_accessor :page_number

  # initialize is called when a new instance is created and accepts path.
  #
  # path - path to the pdf on disk.
  #
  # Raises Grim::PdfNotFound if nothing exists at path.
  def initialize(path)
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    raise Grim::PdfNotFound unless File.exist?(path)
    @page_number = 1
    @path = path
  end

  # page_count shells out to ghostscript to read the pdf with the
  # pdf_info.ps script as a filter and memoizes the result.
  #
  # For example:
  #
  #   instance.page_count
  #   # => 4
  #
  # Returns an integer.
  def page_count
    @page_count ||= run("gs", "-dNODISPLAY", "-q", "-sFile=#{@path}", PDF_INFO_SCRIPT).to_i
  end

  # page just sets the page attribute on the instance.
  #
  # For example:
  #
  #   instance.page(1)
  #   # => instance
  #
  # Returns self.
  def page(number)
    @page_number = number
    self
  end

  # Returns page_number minus 1 (the zero-based index handed to convert).
  def index
    @page_number - 1
  end

  # to_image extracts the selected page and turns it into an image.
  # Tested on png and jpeg.
  #
  # path - where the image is written.
  #
  # For example:
  #
  #   instance.page(2).to_image(/path/to/save/image)
  #   # => File
  #
  # Returns an open File for path; the caller is responsible for closing it.
  def to_image(path)
    run(
      "convert",
      "-resize", Grim::WIDTH.to_s,
      "-antialias",
      "-render",
      "-quality", Grim::QUALITY.to_s,
      "-colorspace", "RGB",
      "-interlace", "none",
      "-density", Grim::DENSITY.to_s,
      "#{@path}[#{index}]",
      path.to_s
    )
    # A freshly opened File is already positioned at offset 0.
    File.open(path)
  end

  # text is an instance method that extracts the text from the selected page.
  #
  # For example:
  #
  #   instance.page(2).text
  #   # => "This is text from slide 2.\n\nAnd even more text from slide 2."
  #
  # Returns a string
  def text
    page_arg = @page_number.to_s
    run("pdftotext", "-enc", "UTF-8", "-f", page_arg, "-l", page_arg, @path.to_s, "-")
  end

  private

  # Runs an external command and returns everything it wrote to stdout.
  # The command is passed as an argument vector, so no shell is involved:
  # paths containing spaces or shell metacharacters reach the program intact.
  def run(*argv)
    IO.popen(argv, &:read)
  end
end
|
data/lib/pdf_info.ps
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
% usage: gs -dNODISPLAY -q -sFile=____.pdf pdf_info.ps
|
2
|
+
|
3
|
+
/File where not {
|
4
|
+
(\n *** Missing input file name \(use -sFile=____.pdf\)\n) =
|
5
|
+
( usage: gs -dNODISPLAY -q -sFile=____.pdf [ options ] toolbin/pdf_info.ps\n) =
|
6
|
+
() =
|
7
|
+
flush
|
8
|
+
quit
|
9
|
+
} if
|
10
|
+
pop % discard the dict from where
|
11
|
+
|
12
|
+
/QUIET true def % in case they forgot
|
13
|
+
|
14
|
+
() =
|
15
|
+
File dup (r) file runpdfbegin
|
16
|
+
/PDFPageCount pdfpagecount def
|
17
|
+
PDFPageCount =print () =
|
18
|
+
flush
|
19
|
+
() =
|
20
|
+
|
21
|
+
quit
|
Binary file
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Specs for Grim. The #page_count, #to_image and #text examples shell out to
# ghostscript, imagemagick and pdftotext, so those tools must be installed.
describe Grim do
  after(:all) do
    FileUtils.rm_rf(tmp_dir)
  end

  it "should have a VERSION constant" do
    Grim.const_defined?('VERSION').should be_true
  end

  describe "Pdf" do
    describe "#initialize" do
      it "should raise an error if pdf does not exist" do
        lambda { Grim.new(fixture_path("booboo.pdf")) }.should raise_error(Grim::PdfNotFound)
      end
    end

    describe "#page_count" do
      it "should return an integer" do
        instance = Grim.new(fixture_path("smoker.pdf"))
        instance.page_count.should == 25
      end
    end

    describe "#page" do
      it "should be set to 1 by default" do
        instance = Grim.new(fixture_path("smoker.pdf"))
        instance.page_number.should == 1
      end

      it "should set page attribute and return instance" do
        instance = Grim.new(fixture_path("smoker.pdf"))
        instance.page(2).should == instance
        instance.page_number.should == 2
      end
    end

    describe "#index" do
      it "should return page minus 1" do
        instance = Grim.new(fixture_path("smoker.pdf"))
        instance.page(2)
        instance.index.should == 1
      end
    end

    describe "#to_image" do
      describe "output png" do
        before(:all) do
          instance = Grim.new(fixture_path("smoker.pdf"))
          @png = instance.to_image(tmp_path("to_png_spec.png"))
        end

        # to_image hands back an open File; release it once the group is done.
        after(:all) do
          @png.close if @png
        end

        it "should create the file" do
          File.exist?(tmp_path("to_png_spec.png")).should be_true
        end

        it "should return an instance of File" do
          @png.class.should == File
        end

        # NOTE(review): exact byte sizes presumably depend on the installed
        # imagemagick/ghostscript versions - confirm before relying on them.
        it "should have the right file size" do
          @png.stat.size.should == 188515
        end
      end

      describe "output jpeg" do
        before(:all) do
          instance = Grim.new(fixture_path("smoker.pdf"))
          @jpeg = instance.to_image(tmp_path("to_jpeg_spec.jpeg"))
        end

        # to_image hands back an open File; release it once the group is done.
        after(:all) do
          @jpeg.close if @jpeg
        end

        it "should create the file" do
          File.exist?(tmp_path("to_jpeg_spec.jpeg")).should be_true
        end

        it "should return an instance of File" do
          @jpeg.class.should == File
        end

        it "should have the right file size" do
          @jpeg.stat.size.should == 53980
        end
      end
    end

    describe "#text" do
      it "should return the text from the selected page" do
        instance = Grim.new(fixture_path("smoker.pdf"))
        instance.page(2).text.should == "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
|
4
|
+
require 'grim'
|
5
|
+
|
6
|
+
RSpec.configure do |config|
  # Returns the absolute path of a file inside spec/fixtures.
  # Resolved relative to this helper rather than the current working
  # directory, so the specs find their fixtures wherever rspec is started.
  def fixture_path(name)
    File.join(File.expand_path("../fixtures", __FILE__), name)
  end

  # Returns the absolute path of the scratch directory (tmp/ next to spec/),
  # creating it on first use.
  def tmp_dir
    path = File.expand_path("../../tmp", __FILE__)
    Dir.mkdir(path) unless File.directory?(path)
    path
  end

  # Returns the absolute path of a file inside the scratch directory.
  def tmp_path(name)
    File.join(tmp_dir, name)
  end
end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: grim
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Jonathan Hoyt
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-09-05 00:00:00 -04:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
|
23
|
+
email:
|
24
|
+
- jonmagic@gmail.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- README.textile
|
35
|
+
- Rakefile
|
36
|
+
- grim.gemspec
|
37
|
+
- lib/grim.rb
|
38
|
+
- lib/pdf_info.ps
|
39
|
+
- spec/fixtures/smoker.pdf
|
40
|
+
- spec/lib/grim_spec.rb
|
41
|
+
- spec/spec_helper.rb
|
42
|
+
has_rdoc: true
|
43
|
+
homepage: ""
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: grim
|
72
|
+
rubygems_version: 1.6.2
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Extract slides and text from a PDF.
|
76
|
+
test_files:
|
77
|
+
- spec/fixtures/smoker.pdf
|
78
|
+
- spec/lib/grim_spec.rb
|
79
|
+
- spec/spec_helper.rb
|